00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017 #include "common.h"
00018 #include "obscure.h"
00019 #include "linefile.h"
00020 #include "dnautil.h"
00021 #include "axt.h"
00022
00023 static char const rcsid[] = "$Id: axt.c,v 1.47 2006/06/18 23:11:58 kate Exp $";
00024
00025 void axtFree(struct axt **pEl)
00026
00027 {
00028 struct axt *el = *pEl;
00029 if (el != NULL)
00030 {
00031 freeMem(el->qName);
00032 freeMem(el->tName);
00033 freeMem(el->qSym);
00034 freeMem(el->tSym);
00035 freez(pEl);
00036 }
00037 }
00038
00039 void axtFreeList(struct axt **pList)
00040
00041 {
00042 struct axt *el, *next;
00043
00044 for (el = *pList; el != NULL; el = next)
00045 {
00046 next = el->next;
00047 axtFree(&el);
00048 }
00049 *pList = NULL;
00050 }
00051
00052
00053 struct axt *axtReadWithPos(struct lineFile *lf, off_t *retOffset)
00054
00055
00056 {
00057 char *words[10], *line;
00058 int wordCount, symCount;
00059 struct axt *axt;
00060
00061 wordCount = lineFileChop(lf, words);
00062 if (retOffset != NULL)
00063 *retOffset = lineFileTell(lf);
00064 if (wordCount <= 0)
00065 return NULL;
00066 if (wordCount < 8)
00067 {
00068 errAbort("Expecting at least 8 words line %d of %s got %d\n", lf->lineIx, lf->fileName,
00069 wordCount);
00070 }
00071 AllocVar(axt);
00072
00073 axt->qName = cloneString(words[4]);
00074 axt->qStart = lineFileNeedNum(lf, words, 5) - 1;
00075 axt->qEnd = lineFileNeedNum(lf, words, 6);
00076 axt->qStrand = words[7][0];
00077 axt->tName = cloneString(words[1]);
00078 axt->tStart = lineFileNeedNum(lf, words, 2) - 1;
00079 axt->tEnd = lineFileNeedNum(lf, words, 3);
00080 axt->tStrand = '+';
00081 if (wordCount > 8)
00082 axt->score = lineFileNeedNum(lf, words, 8);
00083 lineFileNeedNext(lf, &line, NULL);
00084 axt->symCount = symCount = strlen(line);
00085 axt->tSym = cloneMem(line, symCount+1);
00086 lineFileNeedNext(lf, &line, NULL);
00087 if (strlen(line) != symCount)
00088 errAbort("Symbol count %d != %d inconsistent between sequences line %d and prev line of %s",
00089 symCount, (int)strlen(line), lf->lineIx, lf->fileName);
00090 axt->qSym = cloneMem(line, symCount+1);
00091 lineFileNext(lf, &line, NULL);
00092 return axt;
00093 }
00094
00095 struct axt *axtRead(struct lineFile *lf)
00096
00097
00098 {
00099 return axtReadWithPos(lf, NULL);
00100 }
00101
00102 void axtWrite(struct axt *axt, FILE *f)
00103
00104 {
00105 static int ix = 0;
00106 fprintf(f, "%d %s %d %d %s %d %d %c",
00107 ix++, axt->tName, axt->tStart+1, axt->tEnd,
00108 axt->qName, axt->qStart+1, axt->qEnd, axt->qStrand);
00109 fprintf(f, " %d", axt->score);
00110 fputc('\n', f);
00111 mustWrite(f, axt->tSym, axt->symCount);
00112 fputc('\n', f);
00113 mustWrite(f, axt->qSym, axt->symCount);
00114 fputc('\n', f);
00115 fputc('\n', f);
00116 if ((strlen(axt->tSym) != strlen(axt->qSym)) || (axt->symCount > strlen(axt->tSym)))
00117 fprintf(stderr,"Symbol count %d != %d || %d > %d inconsistent in %s in "
00118 "record %d.\n",
00119 (int)strlen(axt->qSym), (int)strlen(axt->tSym), axt->symCount,
00120 (int)strlen(axt->tSym), axt->qName, ix);
00121 }
00122
00123 int axtCmpQuery(const void *va, const void *vb)
00124
00125 {
00126 const struct axt *a = *((struct axt **)va);
00127 const struct axt *b = *((struct axt **)vb);
00128 int dif;
00129 dif = strcmp(a->qName, b->qName);
00130 if (dif == 0)
00131 dif = a->qStart - b->qStart;
00132 return dif;
00133 }
00134
00135 int axtCmpTarget(const void *va, const void *vb)
00136
00137 {
00138 const struct axt *a = *((struct axt **)va);
00139 const struct axt *b = *((struct axt **)vb);
00140 int dif;
00141 dif = strcmp(a->tName, b->tName);
00142 if (dif == 0)
00143 dif = a->tStart - b->tStart;
00144 return dif;
00145 }
00146
00147 int axtCmpScore(const void *va, const void *vb)
00148
00149 {
00150 const struct axt *a = *((struct axt **)va);
00151 const struct axt *b = *((struct axt **)vb);
00152 return b->score - a->score;
00153 }
00154
00155 int axtCmpTargetScoreDesc(const void *va, const void *vb)
00156
00157 {
00158 const struct axt *a = *((struct axt **)va);
00159 const struct axt *b = *((struct axt **)vb);
00160 int dif;
00161 dif = strcmp(a->tName, b->tName);
00162 if (dif == 0)
00163 dif = b->score - a->score;
00164 return dif;
00165 }
00166
00167 boolean axtCheck(struct axt *axt, struct lineFile *lf)
00168
00169 {
00170 int tSize = countNonDash(axt->tSym, axt->symCount);
00171 int qSize = countNonDash(axt->qSym, axt->symCount);
00172 if (tSize != axt->tEnd - axt->tStart)
00173 {
00174 warn("%d non-dashes, but %d bases to cover at line %d of %s",
00175 tSize, axt->tEnd - axt->tStart, lf->lineIx, lf->fileName);
00176 return FALSE;
00177 }
00178 if (qSize != axt->qEnd - axt->qStart)
00179 {
00180 warn("%d non-dashes, but %d bases to cover at line %d of %s",
00181 tSize, axt->qEnd - axt->qStart, lf->lineIx, lf->fileName);
00182 return FALSE;
00183 }
00184 return TRUE;
00185 }
00186
00187 int axtScoreUngapped(struct axtScoreScheme *ss, char *q, char *t, int size)
00188
00189 {
00190 int score = 0;
00191 int i;
00192 for (i=0; i<size; ++i)
00193 score += ss->matrix[(int)q[i]][(int)t[i]];
00194 return score;
00195 }
00196
00197 int axtScoreSym(struct axtScoreScheme *ss, int symCount, char *qSym, char *tSym)
00198
00199 {
00200 int i;
00201 char q,t;
00202 int score = 0;
00203 boolean lastGap = FALSE;
00204 int gapStart = ss->gapOpen;
00205 int gapExt = ss->gapExtend;
00206
00207 dnaUtilOpen();
00208 for (i=0; i<symCount; ++i)
00209 {
00210 q = qSym[i];
00211 t = tSym[i];
00212 if (q == '-' || t == '-')
00213 {
00214 if (lastGap)
00215 score -= gapExt;
00216 else
00217 {
00218
00219 score -= (gapStart + gapExt);
00220 lastGap = TRUE;
00221 }
00222 }
00223 else
00224 {
00225 score += ss->matrix[(int)q][(int)t];
00226 lastGap = FALSE;
00227 }
00228 }
00229 return score;
00230 }
00231
00232 boolean gapNotMasked(char q, char t)
00233
00234 {
00235 if (q=='-' && t=='-')
00236 return FALSE;
00237 if (q=='-' && t<'a')
00238 return TRUE;
00239 if (t=='-' && q<'a')
00240 return TRUE;
00241 return FALSE;
00242 }
00243
00244
00245 int axtScoreSymFilterRepeats(struct axtScoreScheme *ss, int symCount, char *qSym, char *tSym)
00246
00247 {
00248 int i;
00249 char q,t;
00250 int score = 0;
00251 boolean lastGap = FALSE;
00252 int gapStart = ss->gapOpen;
00253 int gapExt = ss->gapExtend;
00254
00255 dnaUtilOpen();
00256 for (i=0; i<symCount; ++i)
00257 {
00258 q = qSym[i];
00259 t = tSym[i];
00260 if ((q == '-' || t == '-') && gapNotMasked(q,t))
00261 {
00262 if (lastGap)
00263 score -= gapExt;
00264 else
00265 {
00266
00267 score -= (gapStart + gapExt);
00268 lastGap = TRUE;
00269 }
00270 }
00271 else
00272 {
00273 score += ss->matrix[(int)q][(int)t];
00274 lastGap = FALSE;
00275 }
00276 }
00277 return score;
00278 }
00279
00280 int axtScoreFilterRepeats(struct axt *axt, struct axtScoreScheme *ss)
00281
00282 {
00283 return axtScoreSymFilterRepeats(ss, axt->symCount, axt->qSym, axt->tSym);
00284 }
00285
00286 int axtScore(struct axt *axt, struct axtScoreScheme *ss)
00287
00288 {
00289 return axtScoreSym(ss, axt->symCount, axt->qSym, axt->tSym);
00290 }
00291
00292 int axtScoreDnaDefault(struct axt *axt)
00293
00294 {
00295 static struct axtScoreScheme *ss;
00296 if (ss == NULL)
00297 ss = axtScoreSchemeDefault();
00298 return axtScore(axt, ss);
00299 }
00300
00301 int axtScoreProteinDefault(struct axt *axt)
00302
00303 {
00304 static struct axtScoreScheme *ss;
00305 if (ss == NULL)
00306 ss = axtScoreSchemeProteinDefault();
00307 return axtScore(axt, ss);
00308 }
00309
00310 void axtSubsetOnT(struct axt *axt, int newStart, int newEnd,
00311 struct axtScoreScheme *ss, FILE *f)
00312
00313
00314 {
00315 if (newStart < axt->tStart)
00316 newStart = axt->tStart;
00317 if (newEnd > axt->tEnd)
00318 newEnd = axt->tEnd;
00319 if (newEnd <= newStart)
00320 return;
00321 if (newStart == axt->tStart && newEnd == axt->tEnd)
00322 {
00323 axt->score = axtScore(axt, ss);
00324 axtWrite(axt, f);
00325 }
00326 else
00327 {
00328 struct axt a = *axt;
00329 char *tSymStart = skipIgnoringDash(a.tSym, newStart - a.tStart, TRUE);
00330 char *tSymEnd = skipIgnoringDash(tSymStart, newEnd - newStart, FALSE);
00331 int symCount = tSymEnd - tSymStart;
00332 char *qSymStart = a.qSym + (tSymStart - a.tSym);
00333 a.qStart += countNonDash(a.qSym, qSymStart - a.qSym);
00334 a.qEnd = a.qStart + countNonDash(qSymStart, symCount);
00335 a.tStart = newStart;
00336 a.tEnd = newEnd;
00337 a.symCount = symCount;
00338 a.qSym = qSymStart;
00339 a.tSym = tSymStart;
00340 a.score = axtScore(&a, ss);
00341 if (a.qStart < a.qEnd && a.tStart < a.tEnd)
00342 axtWrite(&a, f);
00343 }
00344 }
00345
00346 int axtTransPosToQ(struct axt *axt, int tPos)
00347
00348 {
00349 char *tSym = skipIgnoringDash(axt->tSym, tPos - axt->tStart, TRUE);
00350 int symIx = tSym - axt->tSym;
00351 int qPos = countNonDash(axt->qSym, symIx);
00352 return qPos + axt->qStart;
00353 }
00354
00355 static void shortScoreScheme(struct lineFile *lf)
00356
00357 {
00358 errAbort("Scoring matrix file %s too short\n", lf->fileName);
00359 }
00360
00361 static void propagateCase(struct axtScoreScheme *ss)
00362
00363
00364 {
00365 static int twoCase[2][4] = {{'a', 'c', 'g', 't'},{'A','C','G','T'},};
00366 int i1,i2,j1,j2;
00367
00368
00369 for (i1=0; i1<=1; ++i1)
00370 for (i2=0; i2<=1; ++i2)
00371 {
00372 if (i1 == 0 && i2 == 0)
00373 continue;
00374 for (j1=0; j1<4; ++j1)
00375 for (j2=0; j2<4; ++j2)
00376 {
00377 ss->matrix[twoCase[i1][j1]][twoCase[i2][j2]] = ss->matrix[twoCase[0][j1]][twoCase[0][j2]];
00378 }
00379 }
00380 }
00381
00382 struct axtScoreScheme *axtScoreSchemeDefault()
00383
00384
00385 {
00386 static struct axtScoreScheme *ss;
00387
00388 if (ss != NULL)
00389 return ss;
00390 AllocVar(ss);
00391
00392
00393 ss->matrix['a']['a'] = 91;
00394 ss->matrix['a']['c'] = -114;
00395 ss->matrix['a']['g'] = -31;
00396 ss->matrix['a']['t'] = -123;
00397
00398 ss->matrix['c']['a'] = -114;
00399 ss->matrix['c']['c'] = 100;
00400 ss->matrix['c']['g'] = -125;
00401 ss->matrix['c']['t'] = -31;
00402
00403 ss->matrix['g']['a'] = -31;
00404 ss->matrix['g']['c'] = -125;
00405 ss->matrix['g']['g'] = 100;
00406 ss->matrix['g']['t'] = -114;
00407
00408 ss->matrix['t']['a'] = -123;
00409 ss->matrix['t']['c'] = -31;
00410 ss->matrix['t']['g'] = -114;
00411 ss->matrix['t']['t'] = 91;
00412
00413 propagateCase(ss);
00414 ss->gapOpen = 400;
00415 ss->gapExtend = 30;
00416 return ss;
00417 }
00418
00419 struct axtScoreScheme *axtScoreSchemeSimpleDna(int match, int misMatch, int gapOpen, int gapExtend)
00420
00421 {
00422 static struct axtScoreScheme *ss;
00423 AllocVar(ss);
00424
00425
00426 ss->matrix['a']['a'] = match;
00427 ss->matrix['a']['c'] = -misMatch;
00428 ss->matrix['a']['g'] = -misMatch;
00429 ss->matrix['a']['t'] = -misMatch;
00430
00431 ss->matrix['c']['a'] = -misMatch;
00432 ss->matrix['c']['c'] = match;
00433 ss->matrix['c']['g'] = -misMatch;
00434 ss->matrix['c']['t'] = -misMatch;
00435
00436 ss->matrix['g']['a'] = -misMatch;
00437 ss->matrix['g']['c'] = -misMatch;
00438 ss->matrix['g']['g'] = match;
00439 ss->matrix['g']['t'] = -misMatch;
00440
00441 ss->matrix['t']['a'] = -misMatch;
00442 ss->matrix['t']['c'] = -misMatch;
00443 ss->matrix['t']['g'] = -misMatch;
00444 ss->matrix['t']['t'] = match;
00445
00446 propagateCase(ss);
00447 ss->gapOpen = gapOpen;
00448 ss->gapExtend = gapExtend;
00449 return ss;
00450 }
00451
00452 struct axtScoreScheme *axtScoreSchemeRnaDefault()
00453
00454
00455 {
00456 static struct axtScoreScheme *ss;
00457 if (ss == NULL)
00458 ss = axtScoreSchemeSimpleDna(100, 200, 300, 300);
00459 return ss;
00460 }
00461
00462 struct axtScoreScheme *axtScoreSchemeRnaFill()
00463
00464
00465 {
00466 static struct axtScoreScheme *ss;
00467 if (ss == NULL)
00468 ss = axtScoreSchemeSimpleDna(100, 100, 200, 200);
00469 return ss;
00470 }
00471
00472 struct axtScoreScheme *axtScoreSchemeFromBlastzMatrix(char *text, int gapOpen, int gapExtend)
00473
00474
00475 {
00476 char *matrixDna[32];
00477 struct axtScoreScheme *ss = axtScoreSchemeDefault();
00478 int matrixSize = chopString(text, ",", matrixDna, 32);
00479 if (matrixSize != 16)
00480 return ss;
00481 if (ss == NULL)
00482 return NULL;
00483 ss->gapOpen = gapOpen;
00484 ss->gapExtend = gapExtend;
00485 ss->matrix['a']['a'] = atoi(matrixDna[0]);
00486 ss->matrix['a']['c'] = atoi(matrixDna[1]);
00487 ss->matrix['a']['g'] = atoi(matrixDna[2]);
00488 ss->matrix['a']['t'] = atoi(matrixDna[3]);
00489
00490 ss->matrix['c']['a'] = atoi(matrixDna[4]);
00491 ss->matrix['c']['c'] = atoi(matrixDna[5]);
00492 ss->matrix['c']['g'] = atoi(matrixDna[6]);
00493 ss->matrix['c']['t'] = atoi(matrixDna[7]);
00494
00495 ss->matrix['g']['a'] = atoi(matrixDna[8]);
00496 ss->matrix['g']['c'] = atoi(matrixDna[9]);
00497 ss->matrix['g']['g'] = atoi(matrixDna[10]);
00498 ss->matrix['g']['t'] = atoi(matrixDna[11]);
00499
00500 ss->matrix['t']['a'] = atoi(matrixDna[12]);
00501 ss->matrix['t']['c'] = atoi(matrixDna[13]);
00502 ss->matrix['t']['g'] = atoi(matrixDna[14]);
00503 ss->matrix['t']['t'] = atoi(matrixDna[15]);
00504 return ss;
00505 }
00506
00507 char blosumText[] = {
00508 "# Matrix made by matblas from blosum62.iij\n"
00509 "# * column uses minimum score\n"
00510 "# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units\n"
00511 "# Blocks Database = /data/blocks_5.0/blocks.dat\n"
00512 "# Cluster Percentage: >= 62\n"
00513 "# Entropy = 0.6979, Expected = -0.5209\n"
00514 " A R N D C Q E G H I L K M F P S T W Y V B Z X *\n"
00515 "A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 \n"
00516 "R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 \n"
00517 "N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4 \n"
00518 "D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4 \n"
00519 "C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 \n"
00520 "Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4 \n"
00521 "E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 \n"
00522 "G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4 \n"
00523 "H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4 \n"
00524 "I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4 \n"
00525 "L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4 \n"
00526 "K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4 \n"
00527 "M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4 \n"
00528 "F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4 \n"
00529 "P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4 \n"
00530 "S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4 \n"
00531 "T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4 \n"
00532 "W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4 \n"
00533 "Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4 \n"
00534 "V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4 \n"
00535 "B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 \n"
00536 "Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 \n"
00537 "X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4 \n"
00538 "* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 \n"
00539 };
00540
00541 static void badProteinMatrixLine(int lineIx, char *fileName)
00542
00543 {
00544 errAbort("Expecting letter and 25 numbers line %d of %s", lineIx, fileName);
00545 }
00546
00547 struct axtScoreScheme *axtScoreSchemeFromProteinText(char *text, char *fileName)
00548
00549
00550 {
00551 char *line, *nextLine;
00552 int lineIx = 0;
00553 int realCount = 0;
00554 char columns[24];
00555 char *row[25];
00556 int i;
00557 struct axtScoreScheme *ss;
00558
00559 AllocVar(ss);
00560 for (line = text; line != NULL; line = nextLine)
00561 {
00562 nextLine = strchr(line, '\n');
00563 if (nextLine != NULL)
00564 *nextLine++ = 0;
00565 ++lineIx;
00566 line = skipLeadingSpaces(line);
00567 if (line[0] == '#' || line[0] == 0)
00568 continue;
00569 ++realCount;
00570 if (realCount == 1)
00571 {
00572 int wordCount = chopLine(line, row);
00573 if (wordCount != 24)
00574 errAbort("Not a good protein matrix - expecting 24 letters line %d of %s", lineIx, fileName);
00575 for (i=0; i<wordCount; ++i)
00576 {
00577 char *letter = row[i];
00578 if (strlen(letter) != 1)
00579 errAbort("Not a good protein matrix - got word not letter line %d of %s", lineIx, fileName);
00580 columns[i] = letter[0];
00581 }
00582 }
00583 else
00584 {
00585 int wordCount = chopLine(line, row);
00586 char letter, lcLetter;
00587 if (wordCount != 25)
00588 badProteinMatrixLine(lineIx, fileName);
00589 letter = row[0][0];
00590 if (strlen(row[0]) != 1 || isdigit(letter))
00591 badProteinMatrixLine(lineIx, fileName);
00592 lcLetter = tolower(letter);
00593 for (i=1; i<wordCount; ++i)
00594 {
00595 char *s = row[i];
00596 int val;
00597 char otherLetter, lcOtherLetter;
00598 if (s[0] == '-') ++s;
00599 if (!isdigit(s[0]))
00600 badProteinMatrixLine(lineIx, fileName);
00601 otherLetter = columns[i-1];
00602 lcOtherLetter = tolower(otherLetter);
00603 val = atoi(row[i]);
00604 ss->matrix[(int)letter][(int)otherLetter] = val;
00605 ss->matrix[(int)lcLetter][(int)otherLetter] = val;
00606 ss->matrix[(int)letter][(int)lcOtherLetter] = val;
00607 ss->matrix[(int)lcLetter][(int)lcOtherLetter] = val;
00608 }
00609 }
00610 }
00611 if (realCount < 25)
00612 errAbort("Unexpected end of %s", fileName);
00613 return ss;
00614 }
00615
00616 struct axtScoreScheme *axtScoreSchemeProteinDefault()
00617
00618
00619 {
00620 static struct axtScoreScheme *ss;
00621 int i,j;
00622 if (ss != NULL)
00623 return ss;
00624 ss = axtScoreSchemeFromProteinText(blosumText, "blosum62");
00625 for (i=0; i<128; ++i)
00626 for (j=0; j<128; ++j)
00627 ss->matrix[i][j] *= 19;
00628 ss->gapOpen = 11 * 19;
00629 ss->gapExtend = 1 * 19;
00630 return ss;
00631 }
00632
00633 void axtScoreSchemeFree(struct axtScoreScheme **pObj)
00634
00635 {
00636 freez(pObj);
00637 }
00638
00639 struct axtScoreScheme *axtScoreSchemeProteinRead(char *fileName)
00640 {
00641 char *string;
00642 struct axtScoreScheme *ss;
00643
00644 readInGulp(fileName, &string, NULL);
00645 ss = axtScoreSchemeFromProteinText(string, fileName);
00646 freeMem(string);
00647
00648 return ss;
00649 }
00650
00651 struct axtScoreScheme *axtScoreSchemeReadLf(struct lineFile *lf )
00652
00653
00654
00655
00656
00657
00658
00659
00660 {
00661 char *line, *row[4], *parts[32];
00662 int i,j, partCount;
00663 struct axtScoreScheme *ss;
00664 boolean gotO = FALSE, gotE = FALSE;
00665 static int trans[4] = {'a', 'c', 'g', 't'};
00666
00667 AllocVar(ss);
00668 ss->extra = NULL;
00669 if (!lineFileRow(lf, row))
00670 shortScoreScheme(lf);
00671 if (row[0][0] != 'A' || row[1][0] != 'C' || row[2][0] != 'G'
00672 || row[3][0] != 'T')
00673 errAbort("%s doesn't seem to be a score matrix file", lf->fileName);
00674 for (i=0; i<4; ++i)
00675 {
00676 if (!lineFileRow(lf, row))
00677 shortScoreScheme(lf);
00678 for (j=0; j<4; ++j)
00679 ss->matrix[trans[i]][trans[j]] = lineFileNeedNum(lf, row, j);
00680 }
00681 if (lineFileNext(lf, &line, NULL))
00682 {
00683 ss->extra = cloneString(line);
00684 partCount = chopString(line, " =,\t", parts, ArraySize(parts));
00685 for (i=0; i<partCount-1; i += 2)
00686 {
00687 if (sameString(parts[i], "O"))
00688 {
00689 gotO = TRUE;
00690 ss->gapOpen = atoi(parts[i+1]);
00691 }
00692 if (sameString(parts[i], "E"))
00693 {
00694 gotE = TRUE;
00695 ss->gapExtend = atoi(parts[i+1]);
00696 }
00697 }
00698 if (!gotO || !gotE)
00699 errAbort("Expecting O = and E = in last line of %s", lf->fileName);
00700 if (ss->gapOpen <= 0 || ss->gapExtend <= 0)
00701 errAbort("Must have positive gap scores");
00702 }
00703 else
00704 {
00705 ss->gapOpen = 400;
00706 ss->gapExtend = 30;
00707 }
00708 propagateCase(ss);
00709 return ss;
00710 }
00711
00712 struct axtScoreScheme *axtScoreSchemeRead(char *fileName)
00713
00714
00715
00716
00717
00718
00719
00720
00721 {
00722 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00723 struct axtScoreScheme *ss = axtScoreSchemeReadLf(lf);
00724 return ss;
00725 }
00726
00727 void axtScoreSchemeDnaWrite(struct axtScoreScheme *ss, FILE *f, char *name)
00728
00729
00730 {
00731 if (ss == NULL)
00732 return;
00733 if (f == NULL)
00734 return;
00735 fprintf(f, "##matrix=%s 16 %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",
00736 name,
00737 ss->matrix['a']['a'],
00738 ss->matrix['a']['c'],
00739 ss->matrix['a']['g'],
00740 ss->matrix['a']['t'],
00741
00742 ss->matrix['c']['a'],
00743 ss->matrix['c']['c'],
00744 ss->matrix['c']['g'],
00745 ss->matrix['c']['t'],
00746
00747 ss->matrix['g']['a'],
00748 ss->matrix['g']['c'],
00749 ss->matrix['g']['g'],
00750 ss->matrix['g']['t'],
00751
00752 ss->matrix['t']['a'],
00753 ss->matrix['t']['c'],
00754 ss->matrix['t']['g'],
00755 ss->matrix['t']['t']);
00756 fprintf(f, "##gapPenalties=%s O=%d E=%d\n", name, ss->gapOpen, ss->gapExtend);
00757 if (ss->extra!=NULL)
00758 {
00759 stripChar(ss->extra,' ');
00760 stripChar(ss->extra,'"');
00761 fprintf(f, "##blastzParms=%s\n", ss->extra);
00762 }
00763 }
00764
00765 void axtSwap(struct axt *axt, int tSize, int qSize)
00766
00767 {
00768 struct axt old = *axt;
00769
00770
00771 axt->qName = old.tName;
00772 axt->tName = old.qName;
00773 axt->qSym = old.tSym;
00774 axt->tSym = old.qSym;
00775 axt->qStart = old.tStart;
00776 axt->qEnd = old.tEnd;
00777 axt->tStart = old.qStart;
00778 axt->tEnd = old.qEnd;
00779
00780
00781 assert(axt->tStrand != '-');
00782
00783 if (axt->qStrand == '-')
00784 {
00785
00786
00787
00788
00789 reverseIntRange(&axt->tStart, &axt->tEnd, qSize);
00790 reverseIntRange(&axt->qStart, &axt->qEnd, tSize);
00791 reverseComplement(axt->qSym, axt->symCount);
00792 reverseComplement(axt->tSym, axt->symCount);
00793 }
00794 }
00795
00796 void axtBundleFree(struct axtBundle **pObj)
00797
00798 {
00799 struct axtBundle *obj = *pObj;
00800 if (obj != NULL)
00801 {
00802 axtFreeList(&obj->axtList);
00803 freez(pObj);
00804 }
00805 }
00806
00807 void axtBundleFreeList(struct axtBundle **pList)
00808
00809 {
00810 struct axtBundle *el, *next;
00811
00812 for (el = *pList; el != NULL; el = next)
00813 {
00814 next = el->next;
00815 axtBundleFree(&el);
00816 }
00817 *pList = NULL;
00818 }
00819
00820 void axtAddBlocksToBoxInList(struct cBlock **pList, struct axt *axt)
00821
00822
00823 {
00824 boolean thisIn, lastIn = FALSE;
00825 int qPos = axt->qStart, tPos = axt->tStart;
00826 int qStart = 0, tStart = 0;
00827 int i;
00828
00829 for (i=0; i<=axt->symCount; ++i)
00830 {
00831 int advanceQ = (isalpha(axt->qSym[i]) ? 1 : 0);
00832 int advanceT = (isalpha(axt->tSym[i]) ? 1 : 0);
00833 thisIn = (advanceQ && advanceT);
00834 if (thisIn)
00835 {
00836 if (!lastIn)
00837 {
00838 qStart = qPos;
00839 tStart = tPos;
00840 }
00841 }
00842 else
00843 {
00844 if (lastIn)
00845 {
00846 int size = qPos - qStart;
00847 assert(size == tPos - tStart);
00848 if (size > 0)
00849 {
00850 struct cBlock *b;
00851 AllocVar(b);
00852 b->qStart = qStart;
00853 b->qEnd = qPos;
00854 b->tStart = tStart;
00855 b->tEnd = tPos;
00856 slAddHead(pList, b);
00857 }
00858 }
00859 }
00860 lastIn = thisIn;
00861 qPos += advanceQ;
00862 tPos += advanceT;
00863 }
00864 }
00865
00866 void axtPrintTraditional(struct axt *axt, int maxLine, struct axtScoreScheme *ss, FILE *f)
00867
00868 {
00869 int qPos = axt->qStart;
00870 int tPos = axt->tStart;
00871 int symPos;
00872 int aDigits = digitsBaseTen(axt->qEnd);
00873 int bDigits = digitsBaseTen(axt->tEnd);
00874 int digits = max(aDigits, bDigits);
00875
00876 for (symPos = 0; symPos < axt->symCount; symPos += maxLine)
00877 {
00878
00879 int lineSize = axt->symCount - symPos;
00880 int lineEnd, i;
00881 if (lineSize > maxLine)
00882 lineSize = maxLine;
00883 lineEnd = symPos + lineSize;
00884
00885
00886 fprintf(f, "%0*d ", digits, qPos+1);
00887 for (i=symPos; i<lineEnd; ++i)
00888 {
00889 char c = axt->qSym[i];
00890 fputc(c, f);
00891 if (c != '.' && c != '-')
00892 ++qPos;
00893 }
00894 fprintf(f, " %0*d\n", digits, qPos);
00895
00896
00897 spaceOut(f, digits+1);
00898 for (i=symPos; i<lineEnd; ++i)
00899 {
00900 char q = axt->qSym[i];
00901 char t = axt->tSym[i];
00902 char out = ' ';
00903 if (q == t)
00904 out = '|';
00905 else if (ss != NULL && ss->matrix[(int)q][(int)t] > 0)
00906 out = '+';
00907 fputc(out, f);
00908 }
00909 fputc('\n', f);
00910
00911
00912 fprintf(f, "%0*d ", digits, tPos+1);
00913 for (i=symPos; i<lineEnd; ++i)
00914 {
00915 char c = axt->tSym[i];
00916 fputc(c, f);
00917 if (c != '.' && c != '-')
00918 ++tPos;
00919 }
00920 fprintf(f, " %0*d\n", digits, tPos);
00921
00922
00923 fputc('\n', f);
00924 }
00925 }
00926
00927 double axtIdWithGaps(struct axt *axt)
00928
00929 {
00930 int i;
00931 int matchCount = 0;
00932 for (i=0; i<axt->symCount; ++i)
00933 {
00934 if (toupper(axt->qSym[i]) == toupper(axt->tSym[i]))
00935 ++matchCount;
00936 }
00937 return (double)matchCount/axt->symCount;
00938 }
00939
00940 double axtCoverage(struct axt *axt, int qSize, int tSize)
00941
00942 {
00943 double cov = axt->tEnd - axt->tStart + axt->qEnd - axt->qStart;
00944 return cov/(qSize+tSize);
00945 }
00946