00001
00002
00003
00004
00005 #include "common.h"
00006 #include "linefile.h"
00007 #include "sqlList.h"
00008 #include "dystring.h"
00009 #include "dnaMotif.h"
00010 #include "portable.h"
00011
00012 static char const rcsid[] = "$Id: dnaMotif.c,v 1.4 2006/09/14 21:08:24 braney Exp $";
00013
00014 struct dnaMotif *dnaMotifCommaIn(char **pS, struct dnaMotif *ret)
00015
00016
00017
00018 {
00019 char *s = *pS;
00020 int i;
00021
00022 if (ret == NULL)
00023 AllocVar(ret);
00024 ret->name = sqlStringComma(&s);
00025 ret->columnCount = sqlSignedComma(&s);
00026 s = sqlEatChar(s, '{');
00027 AllocArray(ret->aProb, ret->columnCount);
00028 for (i=0; i<ret->columnCount; ++i)
00029 {
00030 ret->aProb[i] = sqlSignedComma(&s);
00031 }
00032 s = sqlEatChar(s, '}');
00033 s = sqlEatChar(s, ',');
00034 s = sqlEatChar(s, '{');
00035 AllocArray(ret->cProb, ret->columnCount);
00036 for (i=0; i<ret->columnCount; ++i)
00037 {
00038 ret->cProb[i] = sqlSignedComma(&s);
00039 }
00040 s = sqlEatChar(s, '}');
00041 s = sqlEatChar(s, ',');
00042 s = sqlEatChar(s, '{');
00043 AllocArray(ret->gProb, ret->columnCount);
00044 for (i=0; i<ret->columnCount; ++i)
00045 {
00046 ret->gProb[i] = sqlSignedComma(&s);
00047 }
00048 s = sqlEatChar(s, '}');
00049 s = sqlEatChar(s, ',');
00050 s = sqlEatChar(s, '{');
00051 AllocArray(ret->tProb, ret->columnCount);
00052 for (i=0; i<ret->columnCount; ++i)
00053 {
00054 ret->tProb[i] = sqlSignedComma(&s);
00055 }
00056 s = sqlEatChar(s, '}');
00057 s = sqlEatChar(s, ',');
00058 *pS = s;
00059 return ret;
00060 }
00061
00062 void dnaMotifFree(struct dnaMotif **pEl)
00063
00064
00065 {
00066 struct dnaMotif *el;
00067
00068 if ((el = *pEl) == NULL) return;
00069 freeMem(el->name);
00070 freeMem(el->aProb);
00071 freeMem(el->cProb);
00072 freeMem(el->gProb);
00073 freeMem(el->tProb);
00074 freez(pEl);
00075 }
00076
00077 void dnaMotifFreeList(struct dnaMotif **pList)
00078
00079 {
00080 struct dnaMotif *el, *next;
00081
00082 for (el = *pList; el != NULL; el = next)
00083 {
00084 next = el->next;
00085 dnaMotifFree(&el);
00086 }
00087 *pList = NULL;
00088 }
00089
00090 void dnaMotifOutput(struct dnaMotif *el, FILE *f, char sep, char lastSep)
00091
00092 {
00093 int i;
00094 if (sep == ',') fputc('"',f);
00095 fprintf(f, "%s", el->name);
00096 if (sep == ',') fputc('"',f);
00097 fputc(sep,f);
00098 fprintf(f, "%d", el->columnCount);
00099 fputc(sep,f);
00100 if (sep == ',') fputc('{',f);
00101 for (i=0; i<el->columnCount; ++i)
00102 {
00103 fprintf(f, "%f", el->aProb[i]);
00104 fputc(',', f);
00105 }
00106 if (sep == ',') fputc('}',f);
00107 fputc(sep,f);
00108 if (sep == ',') fputc('{',f);
00109 for (i=0; i<el->columnCount; ++i)
00110 {
00111 fprintf(f, "%f", el->cProb[i]);
00112 fputc(',', f);
00113 }
00114 if (sep == ',') fputc('}',f);
00115 fputc(sep,f);
00116 if (sep == ',') fputc('{',f);
00117 for (i=0; i<el->columnCount; ++i)
00118 {
00119 fprintf(f, "%f", el->gProb[i]);
00120 fputc(',', f);
00121 }
00122 if (sep == ',') fputc('}',f);
00123 fputc(sep,f);
00124 if (sep == ',') fputc('{',f);
00125 for (i=0; i<el->columnCount; ++i)
00126 {
00127 fprintf(f, "%f", el->tProb[i]);
00128 fputc(',', f);
00129 }
00130 if (sep == ',') fputc('}',f);
00131 fputc(lastSep,f);
00132 }
00133
00134 float dnaMotifSequenceProb(struct dnaMotif *motif, DNA *dna)
00135
00136
00137
00138 {
00139 float p = 1.0;
00140 int i;
00141 for (i=0; i<motif->columnCount; ++i)
00142 {
00143 switch (dna[i])
00144 {
00145 case 'a':
00146 case 'A':
00147 p *= motif->aProb[i];
00148 break;
00149 case 'c':
00150 case 'C':
00151 p *= motif->cProb[i];
00152 break;
00153 case 'g':
00154 case 'G':
00155 p *= motif->gProb[i];
00156 break;
00157 case 't':
00158 case 'T':
00159 p *= motif->tProb[i];
00160 break;
00161 case 0:
00162 warn("dna shorter than motif");
00163 internalErr();
00164 break;
00165 default:
00166 p *= 0.25;
00167 break;
00168 }
00169 }
00170 return p;
00171 }
00172
00173 char dnaMotifBestStrand(struct dnaMotif *motif, DNA *dna)
00174
00175 {
00176 float fScore, rScore;
00177 fScore = dnaMotifSequenceProb(motif, dna);
00178 reverseComplement(dna, motif->columnCount);
00179 rScore = dnaMotifSequenceProb(motif, dna);
00180 reverseComplement(dna, motif->columnCount);
00181 if (fScore >= rScore)
00182 return '+';
00183 else
00184 return '-';
00185 }
00186
00187 double dnaMotifBitScore(struct dnaMotif *motif, DNA *dna)
00188
00189 {
00190 double p = dnaMotifSequenceProb(motif, dna);
00191 double q = pow(0.25, motif->columnCount);
00192 double odds = p/q;
00193 return logBase2(odds);
00194 }
00195
00196
00197 void dnaMotifNormalize(struct dnaMotif *motif)
00198
00199 {
00200 int i;
00201 for (i=0; i<motif->columnCount; ++i)
00202 {
00203 float sum = motif->aProb[i] + motif->cProb[i] + motif->gProb[i] + motif->tProb[i];
00204 if (sum < 0)
00205 errAbort("%s has negative numbers, perhaps it's score not probability based",
00206 motif->name);
00207 if (sum == 0)
00208 motif->aProb[i] = motif->cProb[i] = motif->gProb[i] = motif->tProb[i] = 0.25;
00209 motif->aProb[i] /= sum;
00210 motif->cProb[i] /= sum;
00211 motif->gProb[i] /= sum;
00212 motif->tProb[i] /= sum;
00213 }
00214 }
00215
00216 boolean dnaMotifIsScoreBased(struct dnaMotif *motif)
00217
00218
00219 {
00220 int i;
00221 for (i=0; i<motif->columnCount; ++i)
00222 {
00223 if (motif->aProb[i] < 0) return TRUE;
00224 if (motif->cProb[i] < 0) return TRUE;
00225 if (motif->gProb[i] < 0) return TRUE;
00226 if (motif->tProb[i] < 0) return TRUE;
00227 }
00228 return FALSE;
00229 }
00230
00231 void dnaMotifScoreToProb(struct dnaMotif *motif)
00232
00233
00234
00235 {
00236 int i;
00237 for (i=0; i<motif->columnCount; ++i)
00238 {
00239 motif->aProb[i] = exp(motif->aProb[i]);
00240 motif->cProb[i] = exp(motif->cProb[i]);
00241 motif->gProb[i] = exp(motif->gProb[i]);
00242 motif->tProb[i] = exp(motif->tProb[i]);
00243 }
00244 dnaMotifNormalize(motif);
00245 }
00246
00247 void dnaMotifMakeProbabalistic(struct dnaMotif *motif)
00248
00249
00250 {
00251 if (dnaMotifIsScoreBased(motif))
00252 dnaMotifScoreToProb(motif);
00253 else
00254 dnaMotifNormalize(motif);
00255 }
00256
00257 static void printProbRow(FILE *f, char *label, float *p, int pCount)
00258
00259 {
00260 int i;
00261 fprintf(f, "%s ", label);
00262 for (i=0; i < pCount; ++i)
00263 fprintf(f, "%5.2f ", p[i]);
00264 printf("\n");
00265 }
00266
00267 void dnaMotifPrintProb(struct dnaMotif *motif, FILE *f)
00268
00269 {
00270 printProbRow(f, "A", motif->aProb, motif->columnCount);
00271 printProbRow(f, "C", motif->cProb, motif->columnCount);
00272 printProbRow(f, "G", motif->gProb, motif->columnCount);
00273 printProbRow(f, "T", motif->tProb, motif->columnCount);
00274 }
00275
00276
00277 static double u1(double prob)
00278
00279 {
00280 if (prob == 0)
00281 return 0;
00282 return prob * logBase2(prob);
00283 }
00284
00285 static double uncertainty(struct dnaMotif *motif, int pos)
00286
00287
00288 {
00289 return -( u1(motif->aProb[pos]) + u1(motif->cProb[pos])
00290 + u1(motif->gProb[pos]) +u1(motif->tProb[pos]) );
00291 }
00292
00293 double dnaMotifBitsOfInfo(struct dnaMotif *motif, int pos)
00294
00295 {
00296 if (pos > motif->columnCount || pos < 0)
00297 internalErr();
00298 return 2 - uncertainty(motif, pos);
00299 }
00300
00301 struct letterProb
00302
00303 {
00304 struct letterProb *next;
00305 double prob;
00306 char letter;
00307 };
00308
00309 static struct letterProb *letterProbNew(char letter, double prob)
00310
00311 {
00312 struct letterProb *lp;
00313 AllocVar(lp);
00314 lp->letter = letter;
00315 lp->prob = prob;
00316 return lp;
00317 }
00318
00319 static int letterProbCmp(const void *va, const void *vb)
00320
00321 {
00322 const struct letterProb *a = *((struct letterProb **)va);
00323 const struct letterProb *b = *((struct letterProb **)vb);
00324 double dif = a->prob - b->prob;
00325 if (dif < 0)
00326 return -1;
00327 else if (dif > 0)
00328 return 1;
00329 else
00330 return 0;
00331 }
00332
00333 static void addBaseProb(struct letterProb **pList, char letter, double prob)
00334
00335 {
00336 if (prob > 0)
00337 {
00338 struct letterProb *lp = letterProbNew(letter, prob);
00339 slAddHead(pList, lp);
00340 }
00341 }
00342
00343 static struct letterProb *letterProbFromMotifColumn(struct dnaMotif *motif, int pos)
00344
00345 {
00346 struct letterProb *lpList = NULL;
00347 addBaseProb(&lpList, 'A', motif->aProb[pos]);
00348 addBaseProb(&lpList, 'C', motif->cProb[pos]);
00349 addBaseProb(&lpList, 'G', motif->gProb[pos]);
00350 addBaseProb(&lpList, 'T', motif->tProb[pos]);
00351 slSort(&lpList, letterProbCmp);
00352 return lpList;
00353 }
00354
00355 static void psOneColumn(struct dnaMotif *motif, int pos,
00356 double xStart, double yStart, double width, double totalHeight,
00357 FILE *f)
00358
00359 {
00360 struct letterProb *lp, *lpList = letterProbFromMotifColumn(motif, pos);
00361 double x = xStart, y = yStart, w = width, h;
00362 for (lp = lpList; lp != NULL; lp = lp->next)
00363 {
00364 h = totalHeight * lp->prob;
00365 if (h >= 1.0)
00366 {
00367 fprintf(f, "%cColor ", tolower(lp->letter));
00368 fprintf(f, "%3.2f ", x);
00369 fprintf(f, "%3.2f ", y);
00370 fprintf(f, "%3.2f ", x + w);
00371 fprintf(f, "%3.2f ", y + h);
00372 fprintf(f, "(%c) textInBox\n", lp->letter);
00373 }
00374 y += h;
00375 }
00376 fprintf(f, "\n");
00377 slFreeList(&lpList);
00378 }
00379
00380 static void dnaMotifDims(struct dnaMotif *motif, double widthPerBase, double height,
00381 int *retWidth, int *retHeight)
00382
00383 {
00384 static int widthFudgeFactor = 2, heightFudgeFactor = 2;
00385 *retWidth = ceil(widthPerBase * motif->columnCount) + widthFudgeFactor;
00386 *retHeight = ceil(height) + heightFudgeFactor;
00387 }
00388
00389 void dnaMotifToLogoPs(struct dnaMotif *motif, double widthPerBase, double height, char *fileName)
00390
00391 {
00392 FILE *f = mustOpen(fileName, "w");
00393 int i;
00394 int xStart = 0;
00395 int w, h;
00396 char *s =
00397 #include "dnaMotif.pss"
00398 ;
00399
00400 dnaMotifDims(motif, widthPerBase, height, &w, &h);
00401 fprintf(f, "%%!PS-Adobe-3.1 EPSF-3.0\n");
00402 fprintf(f, "%%%%BoundingBox: 0 0 %d %d\n\n", w, h);
00403 fprintf(f, "%s", s);
00404
00405 fprintf(f, "%s", "% Start of code for this specific logo\n");
00406
00407 for (i=0; i<motif->columnCount; ++i)
00408 {
00409 double infoScale = dnaMotifBitsOfInfo(motif, i)/2.0;
00410 psOneColumn(motif, i, xStart, 0, widthPerBase, infoScale * height, f);
00411 xStart += widthPerBase;
00412 }
00413 fprintf(f, "showpage\n");
00414 carefulClose(&f);
00415 }
00416
00417 void dnaMotifToLogoPng(
00418 struct dnaMotif *motif,
00419 double widthPerBase,
00420 double height,
00421 char *gsExe,
00422 char *tempDir,
00423 char *fileName)
00424
00425 {
00426 char *psName = rTempName(tempDir, "dnaMotif", ".ps");
00427 struct dyString *dy = dyStringNew(0);
00428 int w, h;
00429 int sysRet;
00430
00431 if (gsExe == NULL) gsExe = "gs";
00432 if (tempDir == NULL) tempDir = "/tmp";
00433 dnaMotifToLogoPs(motif, widthPerBase, height, psName);
00434 dnaMotifDims(motif, widthPerBase, height, &w, &h);
00435 dyStringAppend(dy, gsExe);
00436 dyStringAppend(dy, " -sDEVICE=png16m -sOutputFile=");
00437 dyStringAppend(dy, fileName);
00438 dyStringAppend(dy, " -dBATCH -dNOPAUSE -q ");
00439 dyStringPrintf(dy, "-g%dx%d ", w, h);
00440 dyStringAppend(dy, psName);
00441 sysRet = system(dy->string);
00442 if (sysRet != 0)
00443 errAbort("System call returned %d for:\n %s", sysRet, dy->string);
00444
00445
00446 dyStringFree(&dy);
00447
00448
00449 dy = newDyString(0);
00450 dyStringPrintf(dy, "chmod 666 %s ", fileName);
00451 sysRet = system(dy->string);
00452
00453 remove(psName);
00454 }
00455
00456 void dnaMotifToLogoPsW(struct dnaMotif *motif, double widthPerBase, double width, double height, char *fileName)
00457
00458 {
00459 FILE *f = mustOpen(fileName, "w");
00460 int i;
00461 int xStart = 0;
00462 int w, h;
00463 char *s =
00464 #include "dnaMotif.pss"
00465 ;
00466
00467 dnaMotifDims(motif, widthPerBase, height, &w, &h);
00468 fprintf(f, "%%!PS-Adobe-3.1 EPSF-3.0\n");
00469 fprintf(f, "%%%%BoundingBox: 0 0 %d %d\n\n", w, h);
00470 fprintf(f, "%s", s);
00471
00472 fprintf(f, "%s", "% Start of code for this specific logo\n");
00473
00474 for (i=0; i<motif->columnCount; ++i)
00475 {
00476 double infoScale = 0.9 ;
00477 xStart = i * width / motif->columnCount;
00478 psOneColumn(motif, i, xStart, 0, widthPerBase, infoScale * height, f);
00479 }
00480 fprintf(f, "showpage\n");
00481 carefulClose(&f);
00482 }
00483
00484 void dnaMotifToLogoPGM(
00485 struct dnaMotif *motif,
00486 double widthPerBase,
00487 double width,
00488 double height,
00489 char *gsExe,
00490 char *tempDir,
00491 char *fileName)
00492
00493 {
00494 char *psName = rTempName(tempDir, "dnaMotif", ".ps");
00495 struct dyString *dy = dyStringNew(0);
00496 int w, h;
00497 int sysRet;
00498
00499 if (gsExe == NULL) gsExe = "gs";
00500 if (tempDir == NULL) tempDir = "/tmp";
00501 dnaMotifToLogoPsW(motif, widthPerBase, width, height, psName);
00502 dnaMotifDims(motif, widthPerBase, height, &w, &h);
00503 dyStringAppend(dy, gsExe);
00504 dyStringAppend(dy, " -sDEVICE=pgmraw -sOutputFile=");
00505 dyStringAppend(dy, fileName);
00506 dyStringAppend(dy, " -dBATCH -dNOPAUSE -q ");
00507 dyStringPrintf(dy, "-g%dx%d ", (int) ceil(width), h);
00508 dyStringAppend(dy, psName);
00509 sysRet = system(dy->string);
00510 if (sysRet != 0)
00511 errAbort("System call returned %d for:\n %s", sysRet, dy->string);
00512
00513
00514 dyStringFree(&dy);
00515
00516
00517 dy = newDyString(0);
00518 dyStringPrintf(dy, "chmod 666 %s ", fileName);
00519 sysRet = system(dy->string);
00520
00521 remove(psName);
00522 }