lib/axt.c

Go to the documentation of this file.
00001 /* AXT - A simple alignment format with four lines per
00002  * alignment.  The first specifies the names of the
00003  * two sequences that align and the position of the
00004  * alignment, as well as the strand.  The next two
00005  * lines contain the alignment itself including dashes
00006  * for inserts.  The alignment is separated from the
00007  * next alignment with a blank line. 
00008  *
00009  * This file contains routines to read such alignments.
00010  * Note that though the coordinates are one based and
00011  * closed on disk, they get converted to our usual half
00012  * open zero based in memory. 
00013  *
00014  * This file is copyright 2002 Jim Kent, but license is hereby
00015  * granted for all use - public, private or commercial. */
00016 
00017 #include "common.h"
00018 #include "obscure.h"
00019 #include "linefile.h"
00020 #include "dnautil.h"
00021 #include "axt.h"
00022 
00023 static char const rcsid[] = "$Id: axt.c,v 1.47 2006/06/18 23:11:58 kate Exp $";
00024 
00025 void axtFree(struct axt **pEl)
00026 /* Free an axt. */
00027 {
00028 struct axt *el = *pEl;
00029 if (el != NULL)
00030     {
00031     freeMem(el->qName);
00032     freeMem(el->tName);
00033     freeMem(el->qSym);
00034     freeMem(el->tSym);
00035     freez(pEl);
00036     }
00037 }
00038 
00039 void axtFreeList(struct axt **pList)
00040 /* Free a list of dynamically allocated axt's */
00041 {
00042 struct axt *el, *next;
00043 
00044 for (el = *pList; el != NULL; el = next)
00045     {
00046     next = el->next;
00047     axtFree(&el);
00048     }
00049 *pList = NULL;
00050 }
00051 
00052 
00053 struct axt *axtReadWithPos(struct lineFile *lf, off_t *retOffset)
00054 /* Read next axt, and if retOffset is not-NULL, fill it with
00055  * offset of start of axt. */
00056 {
00057 char *words[10], *line;
00058 int wordCount, symCount;
00059 struct axt *axt;
00060 
00061 wordCount = lineFileChop(lf, words);
00062 if (retOffset != NULL)
00063     *retOffset = lineFileTell(lf);
00064 if (wordCount <= 0)
00065     return NULL;
00066 if (wordCount < 8)
00067     {
00068     errAbort("Expecting at least 8 words line %d of %s got %d\n", lf->lineIx, lf->fileName,
00069         wordCount);
00070     }
00071 AllocVar(axt);
00072 
00073 axt->qName = cloneString(words[4]);
00074 axt->qStart = lineFileNeedNum(lf, words, 5) - 1;
00075 axt->qEnd = lineFileNeedNum(lf, words, 6);
00076 axt->qStrand = words[7][0];
00077 axt->tName = cloneString(words[1]);
00078 axt->tStart = lineFileNeedNum(lf, words, 2) - 1;
00079 axt->tEnd = lineFileNeedNum(lf, words, 3);
00080 axt->tStrand = '+';
00081 if (wordCount > 8)
00082     axt->score = lineFileNeedNum(lf, words, 8);
00083 lineFileNeedNext(lf, &line, NULL);
00084 axt->symCount = symCount = strlen(line);
00085 axt->tSym = cloneMem(line, symCount+1);
00086 lineFileNeedNext(lf, &line, NULL);
00087 if (strlen(line) != symCount)
00088     errAbort("Symbol count %d != %d inconsistent between sequences line %d and prev line of %s",
00089         symCount, (int)strlen(line), lf->lineIx, lf->fileName);
00090 axt->qSym = cloneMem(line, symCount+1);
00091 lineFileNext(lf, &line, NULL);  /* Skip blank line */
00092 return axt;
00093 }
00094 
00095 struct axt *axtRead(struct lineFile *lf)
00096 /* Read in next record from .axt file and return it.
00097  * Returns NULL at EOF. */
00098 {
00099 return axtReadWithPos(lf, NULL);
00100 }
00101 
00102 void axtWrite(struct axt *axt, FILE *f)
00103 /* Output axt to axt file. */
00104 {
00105 static int ix = 0;
00106 fprintf(f, "%d %s %d %d %s %d %d %c",
00107         ix++, axt->tName, axt->tStart+1, axt->tEnd, 
00108         axt->qName, axt->qStart+1, axt->qEnd, axt->qStrand);
00109 fprintf(f, " %d", axt->score);
00110 fputc('\n', f);
00111 mustWrite(f, axt->tSym, axt->symCount);
00112 fputc('\n', f);
00113 mustWrite(f, axt->qSym, axt->symCount);
00114 fputc('\n', f);
00115 fputc('\n', f);
00116 if ((strlen(axt->tSym) != strlen(axt->qSym)) || (axt->symCount > strlen(axt->tSym)))
00117     fprintf(stderr,"Symbol count %d != %d || %d > %d inconsistent in %s in "
00118             "record %d.\n",
00119             (int)strlen(axt->qSym), (int)strlen(axt->tSym), axt->symCount,
00120             (int)strlen(axt->tSym), axt->qName, ix);
00121 }
00122 
00123 int axtCmpQuery(const void *va, const void *vb)
00124 /* Compare to sort based on query position. */
00125 {
00126 const struct axt *a = *((struct axt **)va);
00127 const struct axt *b = *((struct axt **)vb);
00128 int dif;
00129 dif = strcmp(a->qName, b->qName);
00130 if (dif == 0)
00131     dif = a->qStart - b->qStart;
00132 return dif;
00133 }
00134 
00135 int axtCmpTarget(const void *va, const void *vb)
00136 /* Compare to sort based on target position. */
00137 {
00138 const struct axt *a = *((struct axt **)va);
00139 const struct axt *b = *((struct axt **)vb);
00140 int dif;
00141 dif = strcmp(a->tName, b->tName);
00142 if (dif == 0)
00143     dif = a->tStart - b->tStart;
00144 return dif;
00145 }
00146 
00147 int axtCmpScore(const void *va, const void *vb)
00148 /* Compare to sort based on score. */
00149 {
00150 const struct axt *a = *((struct axt **)va);
00151 const struct axt *b = *((struct axt **)vb);
00152 return b->score - a->score;
00153 }
00154 
00155 int axtCmpTargetScoreDesc(const void *va, const void *vb)
00156 /* Compare to sort based on target name and score descending. */
00157 {
00158 const struct axt *a = *((struct axt **)va);
00159 const struct axt *b = *((struct axt **)vb);
00160 int dif;
00161 dif = strcmp(a->tName, b->tName);
00162 if (dif == 0)
00163     dif = b->score - a->score;
00164 return dif;
00165 }
00166 
00167 boolean axtCheck(struct axt *axt, struct lineFile *lf)
00168 /* Return FALSE if there's a problem with axt. */
00169 {
00170 int tSize = countNonDash(axt->tSym, axt->symCount);
00171 int qSize = countNonDash(axt->qSym, axt->symCount);
00172 if (tSize != axt->tEnd - axt->tStart)
00173     {
00174     warn("%d non-dashes, but %d bases to cover at line %d of %s", 
00175         tSize, axt->tEnd - axt->tStart, lf->lineIx, lf->fileName);
00176     return FALSE;
00177     }
00178 if (qSize != axt->qEnd - axt->qStart)
00179     {
00180     warn("%d non-dashes, but %d bases to cover at line %d of %s", 
00181         tSize, axt->qEnd - axt->qStart, lf->lineIx, lf->fileName);
00182     return FALSE;
00183     }
00184 return TRUE;
00185 }
00186 
00187 int axtScoreUngapped(struct axtScoreScheme *ss, char *q, char *t, int size)
00188 /* Score ungapped alignment. */
00189 {
00190 int score = 0;
00191 int i;
00192 for (i=0; i<size; ++i)
00193     score += ss->matrix[(int)q[i]][(int)t[i]];
00194 return score;
00195 }
00196 
00197 int axtScoreSym(struct axtScoreScheme *ss, int symCount, char *qSym, char *tSym)
00198 /* Return score without setting up an axt structure. */
00199 {
00200 int i;
00201 char q,t;
00202 int score = 0;
00203 boolean lastGap = FALSE;
00204 int gapStart = ss->gapOpen;
00205 int gapExt = ss->gapExtend;
00206 
00207 dnaUtilOpen();
00208 for (i=0; i<symCount; ++i)
00209     {
00210     q = qSym[i];
00211     t = tSym[i];
00212     if (q == '-' || t == '-')
00213         {
00214         if (lastGap)
00215             score -= gapExt;
00216         else
00217             {
00218             /* Use gapStart+gapExt to be consistent with blastz: */
00219             score -= (gapStart + gapExt);
00220             lastGap = TRUE;
00221             }
00222         }
00223     else
00224         {
00225         score += ss->matrix[(int)q][(int)t];
00226         lastGap = FALSE;
00227         }
00228     }
00229 return score;
00230 }
00231 
00232 boolean gapNotMasked(char q, char t)
00233 /* return true if gap on one side and upper case on other side */
00234 {
00235 if (q=='-' && t=='-')
00236     return FALSE;
00237 if (q=='-' && t<'a')
00238     return TRUE;
00239 if (t=='-' && q<'a')
00240     return TRUE;
00241 return FALSE;
00242 }
00243 
00244 
00245 int axtScoreSymFilterRepeats(struct axtScoreScheme *ss, int symCount, char *qSym, char *tSym)
00246 /* Return score without setting up an axt structure. Do not penalize gaps if repeat masked (lowercase)*/
00247 {
00248 int i;
00249 char q,t;
00250 int score = 0;
00251 boolean lastGap = FALSE;
00252 int gapStart = ss->gapOpen;
00253 int gapExt = ss->gapExtend;
00254 
00255 dnaUtilOpen();
00256 for (i=0; i<symCount; ++i)
00257     {
00258     q = qSym[i];
00259     t = tSym[i];
00260     if ((q == '-' || t == '-') && gapNotMasked(q,t))
00261         {
00262         if (lastGap)
00263             score -= gapExt;
00264         else
00265             {
00266             /* Use gapStart+gapExt to be consistent with blastz: */
00267             score -= (gapStart + gapExt);
00268             lastGap = TRUE;
00269             }
00270         }
00271     else
00272         {
00273         score += ss->matrix[(int)q][(int)t];
00274         lastGap = FALSE;
00275         }
00276     }
00277 return score;
00278 }
00279 
00280 int axtScoreFilterRepeats(struct axt *axt, struct axtScoreScheme *ss)
00281 /* Return calculated score of axt. */
00282 {
00283 return axtScoreSymFilterRepeats(ss, axt->symCount, axt->qSym, axt->tSym);
00284 }
00285 
00286 int axtScore(struct axt *axt, struct axtScoreScheme *ss)
00287 /* Return calculated score of axt. */
00288 {
00289 return axtScoreSym(ss, axt->symCount, axt->qSym, axt->tSym);
00290 }
00291 
00292 int axtScoreDnaDefault(struct axt *axt)
00293 /* Score DNA-based axt using default scheme. */
00294 {
00295 static struct axtScoreScheme *ss;
00296 if (ss == NULL)
00297     ss = axtScoreSchemeDefault();
00298 return axtScore(axt, ss);
00299 }
00300 
00301 int axtScoreProteinDefault(struct axt *axt)
00302 /* Score protein-based axt using default scheme. */
00303 {
00304 static struct axtScoreScheme *ss;
00305 if (ss == NULL)
00306     ss = axtScoreSchemeProteinDefault();
00307 return axtScore(axt, ss);
00308 }
00309 
00310 void axtSubsetOnT(struct axt *axt, int newStart, int newEnd, 
00311         struct axtScoreScheme *ss, FILE *f)
00312 /* Write out subset of axt that goes from newStart to newEnd
00313  * in target coordinates. */
00314 {
00315 if (newStart < axt->tStart)
00316     newStart = axt->tStart;
00317 if (newEnd > axt->tEnd)
00318     newEnd = axt->tEnd;
00319 if (newEnd <= newStart) 
00320     return;
00321 if (newStart == axt->tStart && newEnd == axt->tEnd)
00322     {
00323     axt->score = axtScore(axt, ss);
00324     axtWrite(axt, f);
00325     }
00326 else
00327     {
00328     struct axt a = *axt;
00329     char *tSymStart = skipIgnoringDash(a.tSym, newStart - a.tStart, TRUE);
00330     char *tSymEnd = skipIgnoringDash(tSymStart, newEnd - newStart, FALSE);
00331     int symCount = tSymEnd - tSymStart;
00332     char *qSymStart = a.qSym + (tSymStart - a.tSym);
00333     a.qStart += countNonDash(a.qSym, qSymStart - a.qSym);
00334     a.qEnd = a.qStart + countNonDash(qSymStart, symCount);
00335     a.tStart = newStart;
00336     a.tEnd = newEnd;
00337     a.symCount = symCount;
00338     a.qSym = qSymStart;
00339     a.tSym = tSymStart;
00340     a.score = axtScore(&a, ss);
00341     if (a.qStart < a.qEnd && a.tStart < a.tEnd)
00342         axtWrite(&a, f);
00343     }
00344 }
00345 
00346 int axtTransPosToQ(struct axt *axt, int tPos)
00347 /* Convert from t to q coordinates */
00348 {
00349 char *tSym = skipIgnoringDash(axt->tSym, tPos - axt->tStart, TRUE);
00350 int symIx = tSym - axt->tSym;
00351 int qPos = countNonDash(axt->qSym, symIx);
00352 return qPos + axt->qStart;
00353 }
00354 
00355 static void shortScoreScheme(struct lineFile *lf)
00356 /* Complain about score file being to short. */
00357 {
00358 errAbort("Scoring matrix file %s too short\n", lf->fileName);
00359 }
00360 
00361 static void propagateCase(struct axtScoreScheme *ss)
00362 /* Propagate score matrix from lower case to mixed case
00363  * in matrix. */
00364 {
00365 static int twoCase[2][4] = {{'a', 'c', 'g', 't'},{'A','C','G','T'},};
00366 int i1,i2,j1,j2;
00367 
00368 /* Propagate to other case combinations. */
00369 for (i1=0; i1<=1; ++i1)
00370     for (i2=0; i2<=1; ++i2)
00371        {
00372        if (i1 == 0 && i2 == 0)
00373            continue;
00374        for (j1=0; j1<4; ++j1)
00375            for (j2=0; j2<4; ++j2)
00376               {
00377               ss->matrix[twoCase[i1][j1]][twoCase[i2][j2]] = ss->matrix[twoCase[0][j1]][twoCase[0][j2]];
00378               }
00379        }
00380 }
00381 
00382 struct axtScoreScheme *axtScoreSchemeDefault()
00383 /* Return default scoring scheme (after blastz).  Do NOT axtScoreSchemeFree
00384  * this. */
00385 {
00386 static struct axtScoreScheme *ss;
00387 
00388 if (ss != NULL)
00389     return ss;
00390 AllocVar(ss);
00391 
00392 /* Set up lower case elements of matrix. */
00393 ss->matrix['a']['a'] = 91;
00394 ss->matrix['a']['c'] = -114;
00395 ss->matrix['a']['g'] = -31;
00396 ss->matrix['a']['t'] = -123;
00397 
00398 ss->matrix['c']['a'] = -114;
00399 ss->matrix['c']['c'] = 100;
00400 ss->matrix['c']['g'] = -125;
00401 ss->matrix['c']['t'] = -31;
00402 
00403 ss->matrix['g']['a'] = -31;
00404 ss->matrix['g']['c'] = -125;
00405 ss->matrix['g']['g'] = 100;
00406 ss->matrix['g']['t'] = -114;
00407 
00408 ss->matrix['t']['a'] = -123;
00409 ss->matrix['t']['c'] = -31;
00410 ss->matrix['t']['g'] = -114;
00411 ss->matrix['t']['t'] = 91;
00412 
00413 propagateCase(ss);
00414 ss->gapOpen = 400;
00415 ss->gapExtend = 30;
00416 return ss;
00417 }
00418 
00419 struct axtScoreScheme *axtScoreSchemeSimpleDna(int match, int misMatch, int gapOpen, int gapExtend)
00420 /* Return a relatively simple scoring scheme for DNA. */
00421 {
00422 static struct axtScoreScheme *ss;
00423 AllocVar(ss);
00424 
00425 /* Set up lower case elements of matrix. */
00426 ss->matrix['a']['a'] = match;
00427 ss->matrix['a']['c'] = -misMatch;
00428 ss->matrix['a']['g'] = -misMatch;
00429 ss->matrix['a']['t'] = -misMatch;
00430 
00431 ss->matrix['c']['a'] = -misMatch;
00432 ss->matrix['c']['c'] = match;
00433 ss->matrix['c']['g'] = -misMatch;
00434 ss->matrix['c']['t'] = -misMatch;
00435 
00436 ss->matrix['g']['a'] = -misMatch;
00437 ss->matrix['g']['c'] = -misMatch;
00438 ss->matrix['g']['g'] = match;
00439 ss->matrix['g']['t'] = -misMatch;
00440 
00441 ss->matrix['t']['a'] = -misMatch;
00442 ss->matrix['t']['c'] = -misMatch;
00443 ss->matrix['t']['g'] = -misMatch;
00444 ss->matrix['t']['t'] = match;
00445 
00446 propagateCase(ss);
00447 ss->gapOpen = gapOpen;
00448 ss->gapExtend = gapExtend;
00449 return ss;
00450 }
00451 
00452 struct axtScoreScheme *axtScoreSchemeRnaDefault()
00453 /* Return default scoring scheme for RNA/DNA alignments
00454  * within the same species.  Do NOT axtScoreSchemeFree */
00455 {
00456 static struct axtScoreScheme *ss;
00457 if (ss == NULL)
00458     ss = axtScoreSchemeSimpleDna(100, 200, 300, 300);
00459 return ss;
00460 }
00461 
00462 struct axtScoreScheme *axtScoreSchemeRnaFill()
00463 /* Return scoreing scheme a little more relaxed than 
00464  * RNA/DNA defaults for filling in gaps. */
00465 {
00466 static struct axtScoreScheme *ss;
00467 if (ss == NULL)
00468     ss = axtScoreSchemeSimpleDna(100, 100, 200, 200);
00469 return ss;
00470 }
00471 
00472 struct axtScoreScheme *axtScoreSchemeFromBlastzMatrix(char *text, int gapOpen, int gapExtend)
00473 /* return scoring schema from a string in the following format */
00474 /* 91,-90,-25,-100,-90,100,-100,-25,-25,-100,100,-90,-100,-25,-90,91 */
00475 {
00476 char *matrixDna[32];
00477 struct axtScoreScheme *ss = axtScoreSchemeDefault();
00478 int matrixSize = chopString(text, ",", matrixDna, 32);
00479 if (matrixSize != 16)
00480     return ss;
00481 if (ss == NULL)
00482     return NULL;
00483 ss->gapOpen = gapOpen;
00484 ss->gapExtend = gapExtend;
00485 ss->matrix['a']['a'] = atoi(matrixDna[0]);
00486 ss->matrix['a']['c'] = atoi(matrixDna[1]);
00487 ss->matrix['a']['g'] = atoi(matrixDna[2]);
00488 ss->matrix['a']['t'] = atoi(matrixDna[3]);
00489 
00490 ss->matrix['c']['a'] = atoi(matrixDna[4]);
00491 ss->matrix['c']['c'] = atoi(matrixDna[5]);
00492 ss->matrix['c']['g'] = atoi(matrixDna[6]);
00493 ss->matrix['c']['t'] = atoi(matrixDna[7]);
00494 
00495 ss->matrix['g']['a'] = atoi(matrixDna[8]);
00496 ss->matrix['g']['c'] = atoi(matrixDna[9]);
00497 ss->matrix['g']['g'] = atoi(matrixDna[10]);
00498 ss->matrix['g']['t'] = atoi(matrixDna[11]);
00499 
00500 ss->matrix['t']['a'] = atoi(matrixDna[12]);
00501 ss->matrix['t']['c'] = atoi(matrixDna[13]);
00502 ss->matrix['t']['g'] = atoi(matrixDna[14]);
00503 ss->matrix['t']['t'] = atoi(matrixDna[15]);
00504 return ss;
00505 }
00506 
00507 char blosumText[] = {
00508 "#  Matrix made by matblas from blosum62.iij\n"
00509 "#  * column uses minimum score\n"
00510 "#  BLOSUM Clustered Scoring Matrix in 1/2 Bit Units\n"
00511 "#  Blocks Database = /data/blocks_5.0/blocks.dat\n"
00512 "#  Cluster Percentage: >= 62\n"
00513 "#  Entropy =   0.6979, Expected =  -0.5209\n"
00514 "   A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *\n"
00515 "A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -4 \n"
00516 "R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -4 \n"
00517 "N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  3  0 -1 -4 \n"
00518 "D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4  1 -1 -4 \n"
00519 "C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 \n"
00520 "Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0  3 -1 -4 \n"
00521 "E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4 \n"
00522 "G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -2 -1 -4 \n"
00523 "H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0  0 -1 -4 \n"
00524 "I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3 -3 -1 -4 \n"
00525 "L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4 -3 -1 -4 \n"
00526 "K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0  1 -1 -4 \n"
00527 "M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3 -1 -1 -4 \n"
00528 "F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3 -3 -1 -4 \n"
00529 "P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -1 -2 -4 \n"
00530 "S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0  0  0 -4 \n"
00531 "T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1  0 -4 \n"
00532 "W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -3 -2 -4 \n"
00533 "Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -2 -1 -4 \n"
00534 "V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3 -2 -1 -4 \n"
00535 "B -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -4 \n"
00536 "Z -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4 \n"
00537 "X  0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -4 \n"
00538 "* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1 \n"
00539 };
00540 
00541 static void badProteinMatrixLine(int lineIx, char *fileName)
00542 /* Explain line syntax for protein matrix and abort */
00543 {
00544 errAbort("Expecting letter and 25 numbers line %d of %s", lineIx, fileName);
00545 }
00546 
00547 struct axtScoreScheme *axtScoreSchemeFromProteinText(char *text, char *fileName)
00548 /* Parse text into a scoring scheme.  This should be in BLAST protein matrix
00549  * format as in blosumText above. */
00550 {
00551 char *line, *nextLine;
00552 int lineIx = 0;
00553 int realCount = 0;
00554 char columns[24];
00555 char *row[25];
00556 int i;
00557 struct axtScoreScheme *ss;
00558 
00559 AllocVar(ss);
00560 for (line = text; line != NULL; line = nextLine)
00561     {
00562     nextLine = strchr(line, '\n');
00563     if (nextLine != NULL)
00564         *nextLine++ = 0;
00565     ++lineIx;
00566     line = skipLeadingSpaces(line);
00567     if (line[0] == '#' || line[0] == 0)
00568         continue;
00569     ++realCount;
00570     if (realCount == 1)
00571         {
00572         int wordCount = chopLine(line, row);
00573         if (wordCount != 24)
00574             errAbort("Not a good protein matrix - expecting 24 letters line %d of %s", lineIx, fileName);
00575         for (i=0; i<wordCount; ++i)
00576             {
00577             char *letter = row[i];
00578             if (strlen(letter) != 1)
00579                 errAbort("Not a good protein matrix - got word not letter line %d of %s", lineIx, fileName);
00580             columns[i] = letter[0];
00581             }
00582         }
00583     else
00584         {
00585         int wordCount = chopLine(line, row);
00586         char letter, lcLetter;
00587         if (wordCount != 25)
00588             badProteinMatrixLine(lineIx, fileName);
00589         letter = row[0][0];
00590         if (strlen(row[0]) != 1 || isdigit(letter))
00591             badProteinMatrixLine(lineIx, fileName);
00592         lcLetter = tolower(letter);
00593         for (i=1; i<wordCount; ++i)
00594             {
00595             char *s = row[i];
00596             int val;
00597             char otherLetter, lcOtherLetter;
00598             if (s[0] == '-') ++s;
00599             if (!isdigit(s[0]))
00600                 badProteinMatrixLine(lineIx, fileName);
00601             otherLetter = columns[i-1];
00602             lcOtherLetter = tolower(otherLetter);
00603             val = atoi(row[i]);
00604             ss->matrix[(int)letter][(int)otherLetter] = val;
00605             ss->matrix[(int)lcLetter][(int)otherLetter] = val;
00606             ss->matrix[(int)letter][(int)lcOtherLetter] = val;
00607             ss->matrix[(int)lcLetter][(int)lcOtherLetter] = val;
00608             }
00609         }
00610     }
00611 if (realCount < 25)
00612     errAbort("Unexpected end of %s", fileName);
00613 return ss;
00614 }
00615 
00616 struct axtScoreScheme *axtScoreSchemeProteinDefault()
00617 /* Returns default protein scoring scheme.  This is
00618  * scaled to be compatible with the blastz one. */
00619 {
00620 static struct axtScoreScheme *ss;
00621 int i,j;
00622 if (ss != NULL)
00623     return ss;
00624 ss = axtScoreSchemeFromProteinText(blosumText, "blosum62");
00625 for (i=0; i<128; ++i)
00626     for (j=0; j<128; ++j)
00627         ss->matrix[i][j] *= 19;
00628 ss->gapOpen = 11 * 19;
00629 ss->gapExtend = 1 * 19;
00630 return ss;
00631 }
00632 
00633 void axtScoreSchemeFree(struct axtScoreScheme **pObj)
00634 /* Free up score scheme. */
00635 {
00636 freez(pObj);
00637 }
00638 
00639 struct axtScoreScheme *axtScoreSchemeProteinRead(char *fileName)
00640 {
00641 char *string;
00642 struct axtScoreScheme *ss;
00643 
00644 readInGulp(fileName, &string, NULL);
00645 ss = axtScoreSchemeFromProteinText(string, fileName);
00646 freeMem(string);
00647 
00648 return ss;
00649 }
00650 
00651 struct axtScoreScheme *axtScoreSchemeReadLf(struct lineFile *lf )
00652 /* Read in scoring scheme from file. Looks like
00653     A    C    G    T
00654     91 -114  -31 -123
00655     -114  100 -125  -31
00656     -31 -125  100 -114
00657     -123  -31 -114   91
00658     O = 400, E = 30
00659 */
00660 {
00661 char *line, *row[4], *parts[32];
00662 int i,j, partCount;
00663 struct axtScoreScheme *ss;
00664 boolean gotO = FALSE, gotE = FALSE;
00665 static int trans[4] = {'a', 'c', 'g', 't'};
00666 
00667 AllocVar(ss);
00668 ss->extra = NULL;
00669 if (!lineFileRow(lf, row))
00670     shortScoreScheme(lf);
00671 if (row[0][0] != 'A' || row[1][0] != 'C' || row[2][0] != 'G' 
00672         || row[3][0] != 'T')
00673     errAbort("%s doesn't seem to be a score matrix file", lf->fileName);
00674 for (i=0; i<4; ++i)
00675     {
00676     if (!lineFileRow(lf, row))
00677         shortScoreScheme(lf);
00678     for (j=0; j<4; ++j)
00679         ss->matrix[trans[i]][trans[j]] = lineFileNeedNum(lf, row, j);
00680     }
00681 if (lineFileNext(lf, &line, NULL))
00682     {
00683     ss->extra = cloneString(line);
00684     partCount = chopString(line, " =,\t", parts, ArraySize(parts));
00685     for (i=0; i<partCount-1; i += 2)
00686         {
00687         if (sameString(parts[i], "O"))
00688             {
00689             gotO = TRUE;
00690             ss->gapOpen = atoi(parts[i+1]);
00691             }
00692         if (sameString(parts[i], "E"))
00693             {
00694             gotE = TRUE;
00695             ss->gapExtend = atoi(parts[i+1]);
00696             }
00697         }
00698     if (!gotO || !gotE)
00699         errAbort("Expecting O = and E = in last line of %s", lf->fileName);
00700     if (ss->gapOpen <= 0 || ss->gapExtend <= 0)
00701         errAbort("Must have positive gap scores");
00702     }
00703 else
00704     {
00705     ss->gapOpen = 400;
00706     ss->gapExtend = 30;
00707     }
00708 propagateCase(ss);
00709 return ss;
00710 }
00711 
00712 struct axtScoreScheme *axtScoreSchemeRead(char *fileName)
00713 /* Read in scoring scheme from file. Looks like
00714     A    C    G    T
00715     91 -114  -31 -123
00716     -114  100 -125  -31
00717     -31 -125  100 -114
00718     -123  -31 -114   91
00719     O = 400, E = 30
00720 */
00721 {
00722 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00723 struct axtScoreScheme *ss = axtScoreSchemeReadLf(lf);
00724 return ss;
00725 }
00726 
00727 void axtScoreSchemeDnaWrite(struct axtScoreScheme *ss, FILE *f, char *name)
00728 /* output the score dna based score matrix in meta Data format to File f,
00729 name should be set to the name of the program that is using the matrix */
00730 {
00731 if (ss == NULL)
00732     return;
00733 if (f == NULL)
00734     return;
00735 fprintf(f, "##matrix=%s 16 %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",
00736         name,
00737     ss->matrix['a']['a'],
00738     ss->matrix['a']['c'],
00739     ss->matrix['a']['g'],
00740     ss->matrix['a']['t'],
00741 
00742     ss->matrix['c']['a'],
00743     ss->matrix['c']['c'],
00744     ss->matrix['c']['g'],
00745     ss->matrix['c']['t'],
00746 
00747     ss->matrix['g']['a'],
00748     ss->matrix['g']['c'],
00749     ss->matrix['g']['g'],
00750     ss->matrix['g']['t'],
00751 
00752     ss->matrix['t']['a'],
00753     ss->matrix['t']['c'],
00754     ss->matrix['t']['g'],
00755     ss->matrix['t']['t']);
00756 fprintf(f, "##gapPenalties=%s O=%d E=%d\n", name, ss->gapOpen, ss->gapExtend);
00757 if (ss->extra!=NULL)
00758     {
00759     stripChar(ss->extra,' ');
00760     stripChar(ss->extra,'"');
00761     fprintf(f, "##blastzParms=%s\n", ss->extra);
00762     }
00763 }
00764 
00765 void axtSwap(struct axt *axt, int tSize, int qSize)
00766 /* Flip target and query on one axt. */
00767 {
00768 struct axt old = *axt;
00769 
00770 /* Copy non-strand dependent stuff */
00771 axt->qName = old.tName;
00772 axt->tName = old.qName;
00773 axt->qSym = old.tSym;
00774 axt->tSym = old.qSym;
00775 axt->qStart = old.tStart;
00776 axt->qEnd = old.tEnd;
00777 axt->tStart = old.qStart;
00778 axt->tEnd = old.qEnd;
00779 
00780 /* Copy strand dependent stuff. */
00781 assert(axt->tStrand != '-');
00782 
00783 if (axt->qStrand == '-')
00784     {
00785     /* axt's are really set up so that the target is on the
00786      * + strand and the query is on the minus strand.
00787      * Therefore we need to reverse complement both 
00788      * strands while swapping to preserve this. */
00789     reverseIntRange(&axt->tStart, &axt->tEnd, qSize);
00790     reverseIntRange(&axt->qStart, &axt->qEnd, tSize);
00791     reverseComplement(axt->qSym, axt->symCount);
00792     reverseComplement(axt->tSym, axt->symCount);
00793     }
00794 }
00795 
00796 void axtBundleFree(struct axtBundle **pObj)
00797 /* Free a axtBundle. */
00798 {
00799 struct axtBundle *obj = *pObj;
00800 if (obj != NULL)
00801     {
00802     axtFreeList(&obj->axtList);
00803     freez(pObj);
00804     }
00805 }
00806 
00807 void axtBundleFreeList(struct axtBundle **pList)
00808 /* Free a list of axtBundles. */
00809 {
00810 struct axtBundle *el, *next;
00811 
00812 for (el = *pList; el != NULL; el = next)
00813     {
00814     next = el->next;
00815     axtBundleFree(&el);
00816     }
00817 *pList = NULL;
00818 }
00819 
00820 void axtAddBlocksToBoxInList(struct cBlock **pList, struct axt *axt)
00821 /* Add blocks (gapless subalignments) from (non-NULL!) axt to block list. 
00822  * Note: list will be in reverse order of axt blocks. */
00823 {
00824 boolean thisIn, lastIn = FALSE;
00825 int qPos = axt->qStart, tPos = axt->tStart;
00826 int qStart = 0, tStart = 0;
00827 int i;
00828 
00829 for (i=0; i<=axt->symCount; ++i)
00830     {
00831     int advanceQ = (isalpha(axt->qSym[i]) ? 1 : 0);
00832     int advanceT = (isalpha(axt->tSym[i]) ? 1 : 0);
00833     thisIn = (advanceQ && advanceT);
00834     if (thisIn)
00835         {
00836         if (!lastIn)
00837             {
00838             qStart = qPos;
00839             tStart = tPos;
00840             }
00841         }
00842     else
00843         {
00844         if (lastIn)
00845             {
00846             int size = qPos - qStart;
00847             assert(size == tPos - tStart);
00848             if (size > 0)
00849                 {
00850                 struct cBlock *b;
00851                 AllocVar(b);
00852                 b->qStart = qStart;
00853                 b->qEnd = qPos;
00854                 b->tStart = tStart;
00855                 b->tEnd = tPos;
00856                 slAddHead(pList, b);
00857                 }
00858             }
00859         }
00860     lastIn = thisIn;
00861     qPos += advanceQ;
00862     tPos += advanceT;
00863     }
00864 }
00865 
00866 void axtPrintTraditional(struct axt *axt, int maxLine, struct axtScoreScheme *ss, FILE *f)
00867 /* Print out an alignment with line-breaks. */
00868 {
00869 int qPos = axt->qStart;
00870 int tPos = axt->tStart;
00871 int symPos;
00872 int aDigits = digitsBaseTen(axt->qEnd);
00873 int bDigits = digitsBaseTen(axt->tEnd);
00874 int digits = max(aDigits, bDigits);
00875 
00876 for (symPos = 0; symPos < axt->symCount; symPos += maxLine)
00877     {
00878     /* Figure out which part of axt to use for this line. */
00879     int lineSize = axt->symCount - symPos;
00880     int lineEnd, i;
00881     if (lineSize > maxLine)
00882         lineSize = maxLine;
00883     lineEnd = symPos + lineSize;
00884 
00885     /* Draw query line including numbers. */
00886     fprintf(f, "%0*d ", digits, qPos+1);
00887     for (i=symPos; i<lineEnd; ++i)
00888         {
00889         char c = axt->qSym[i];
00890         fputc(c, f);
00891         if (c != '.' && c != '-')
00892             ++qPos;
00893         }
00894     fprintf(f, " %0*d\n", digits, qPos);
00895 
00896     /* Draw line with match/mismatch symbols. */
00897     spaceOut(f, digits+1);
00898     for (i=symPos; i<lineEnd; ++i)
00899         {
00900         char q = axt->qSym[i];
00901         char t = axt->tSym[i];
00902         char out = ' ';
00903         if (q == t)
00904             out = '|';
00905         else if (ss != NULL && ss->matrix[(int)q][(int)t] > 0)
00906             out = '+';
00907         fputc(out, f);
00908         }
00909     fputc('\n', f);
00910 
00911     /* Draw target line including numbers. */
00912     fprintf(f, "%0*d ", digits, tPos+1);
00913     for (i=symPos; i<lineEnd; ++i)
00914         {
00915         char c = axt->tSym[i];
00916         fputc(c, f);
00917         if (c != '.' && c != '-')
00918             ++tPos;
00919         }
00920     fprintf(f, " %0*d\n", digits, tPos);
00921 
00922     /* Draw extra empty line. */
00923     fputc('\n', f);
00924     }
00925 }
00926 
00927 double axtIdWithGaps(struct axt *axt)
00928 /* Return ratio of matching bases to total symbols in alignment. */
00929 {
00930 int i;
00931 int matchCount = 0;
00932 for (i=0; i<axt->symCount; ++i)
00933     {
00934     if (toupper(axt->qSym[i]) == toupper(axt->tSym[i]))
00935         ++matchCount;
00936     }
00937 return (double)matchCount/axt->symCount;
00938 }
00939 
00940 double axtCoverage(struct axt *axt, int qSize, int tSize)
00941 /* Return % of q and t covered. */
00942 {
00943 double cov = axt->tEnd - axt->tStart + axt->qEnd - axt->qStart;
00944 return cov/(qSize+tSize);
00945 }
00946 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2