lib/dnaMotif.c

Go to the documentation of this file.
00001 /* dnaMotif.c was originally generated by the autoSql program, which also 
00002  * generated dnaMotif.h and dnaMotif.sql.  This module links the database and
00003  * the RAM representation of objects. */
00004 
00005 #include "common.h"
00006 #include "linefile.h"
00007 #include "sqlList.h"
00008 #include "dystring.h"
00009 #include "dnaMotif.h"
00010 #include "portable.h"
00011 
00012 static char const rcsid[] = "$Id: dnaMotif.c,v 1.4 2006/09/14 21:08:24 braney Exp $";
00013 
00014 struct dnaMotif *dnaMotifCommaIn(char **pS, struct dnaMotif *ret)
00015 /* Create a dnaMotif out of a comma separated string. 
00016  * This will fill in ret if non-null, otherwise will
00017  * return a new dnaMotif */
00018 {
00019 char *s = *pS;
00020 int i;
00021 
00022 if (ret == NULL)
00023     AllocVar(ret);
00024 ret->name = sqlStringComma(&s);
00025 ret->columnCount = sqlSignedComma(&s);
00026 s = sqlEatChar(s, '{');
00027 AllocArray(ret->aProb, ret->columnCount);
00028 for (i=0; i<ret->columnCount; ++i)
00029     {
00030     ret->aProb[i] = sqlSignedComma(&s);
00031     }
00032 s = sqlEatChar(s, '}');
00033 s = sqlEatChar(s, ',');
00034 s = sqlEatChar(s, '{');
00035 AllocArray(ret->cProb, ret->columnCount);
00036 for (i=0; i<ret->columnCount; ++i)
00037     {
00038     ret->cProb[i] = sqlSignedComma(&s);
00039     }
00040 s = sqlEatChar(s, '}');
00041 s = sqlEatChar(s, ',');
00042 s = sqlEatChar(s, '{');
00043 AllocArray(ret->gProb, ret->columnCount);
00044 for (i=0; i<ret->columnCount; ++i)
00045     {
00046     ret->gProb[i] = sqlSignedComma(&s);
00047     }
00048 s = sqlEatChar(s, '}');
00049 s = sqlEatChar(s, ',');
00050 s = sqlEatChar(s, '{');
00051 AllocArray(ret->tProb, ret->columnCount);
00052 for (i=0; i<ret->columnCount; ++i)
00053     {
00054     ret->tProb[i] = sqlSignedComma(&s);
00055     }
00056 s = sqlEatChar(s, '}');
00057 s = sqlEatChar(s, ',');
00058 *pS = s;
00059 return ret;
00060 }
00061 
00062 void dnaMotifFree(struct dnaMotif **pEl)
00063 /* Free a single dynamically allocated dnaMotif such as created
00064  * with dnaMotifLoad(). */
00065 {
00066 struct dnaMotif *el;
00067 
00068 if ((el = *pEl) == NULL) return;
00069 freeMem(el->name);
00070 freeMem(el->aProb);
00071 freeMem(el->cProb);
00072 freeMem(el->gProb);
00073 freeMem(el->tProb);
00074 freez(pEl);
00075 }
00076 
00077 void dnaMotifFreeList(struct dnaMotif **pList)
00078 /* Free a list of dynamically allocated dnaMotif's */
00079 {
00080 struct dnaMotif *el, *next;
00081 
00082 for (el = *pList; el != NULL; el = next)
00083     {
00084     next = el->next;
00085     dnaMotifFree(&el);
00086     }
00087 *pList = NULL;
00088 }
00089 
00090 void dnaMotifOutput(struct dnaMotif *el, FILE *f, char sep, char lastSep) 
00091 /* Print out dnaMotif.  Separate fields with sep. Follow last field with lastSep. */
00092 {
00093 int i;
00094 if (sep == ',') fputc('"',f);
00095 fprintf(f, "%s", el->name);
00096 if (sep == ',') fputc('"',f);
00097 fputc(sep,f);
00098 fprintf(f, "%d", el->columnCount);
00099 fputc(sep,f);
00100 if (sep == ',') fputc('{',f);
00101 for (i=0; i<el->columnCount; ++i)
00102     {
00103     fprintf(f, "%f", el->aProb[i]);
00104     fputc(',', f);
00105     }
00106 if (sep == ',') fputc('}',f);
00107 fputc(sep,f);
00108 if (sep == ',') fputc('{',f);
00109 for (i=0; i<el->columnCount; ++i)
00110     {
00111     fprintf(f, "%f", el->cProb[i]);
00112     fputc(',', f);
00113     }
00114 if (sep == ',') fputc('}',f);
00115 fputc(sep,f);
00116 if (sep == ',') fputc('{',f);
00117 for (i=0; i<el->columnCount; ++i)
00118     {
00119     fprintf(f, "%f", el->gProb[i]);
00120     fputc(',', f);
00121     }
00122 if (sep == ',') fputc('}',f);
00123 fputc(sep,f);
00124 if (sep == ',') fputc('{',f);
00125 for (i=0; i<el->columnCount; ++i)
00126     {
00127     fprintf(f, "%f", el->tProb[i]);
00128     fputc(',', f);
00129     }
00130 if (sep == ',') fputc('}',f);
00131 fputc(lastSep,f);
00132 }
00133 
00134 float dnaMotifSequenceProb(struct dnaMotif *motif, DNA *dna)
00135 /* Return probability of dna according to motif.  Make sure
00136  * motif is probabalistic (with call to dnaMotifMakeProbabalistic
00137  * if you're not sure) before calling this. */
00138 {
00139 float p = 1.0;
00140 int i;
00141 for (i=0; i<motif->columnCount; ++i)
00142     {
00143     switch (dna[i])
00144         {
00145         case 'a':
00146         case 'A':
00147             p *= motif->aProb[i];
00148             break;
00149         case 'c':
00150         case 'C':
00151             p *= motif->cProb[i];
00152             break;
00153         case 'g':
00154         case 'G':
00155             p *= motif->gProb[i];
00156             break;
00157         case 't':
00158         case 'T':
00159             p *= motif->tProb[i];
00160             break;
00161         case 0:
00162             warn("dna shorter than motif");
00163             internalErr();
00164             break;
00165         default:
00166             p *= 0.25;
00167             break;
00168         }
00169     }
00170 return p;
00171 }
00172 
00173 char dnaMotifBestStrand(struct dnaMotif *motif, DNA *dna)
00174 /* Figure out which strand of DNA is better for probabalistic motif. */
00175 {
00176 float fScore, rScore;
00177 fScore = dnaMotifSequenceProb(motif, dna);
00178 reverseComplement(dna, motif->columnCount);
00179 rScore = dnaMotifSequenceProb(motif, dna);
00180 reverseComplement(dna, motif->columnCount);
00181 if (fScore >= rScore)
00182     return '+';
00183 else
00184     return '-';
00185 }
00186 
00187 double dnaMotifBitScore(struct dnaMotif *motif, DNA *dna)
00188 /* Return logBase2-odds score of dna given a probabalistic motif. */
00189 {
00190 double p = dnaMotifSequenceProb(motif, dna);
00191 double q = pow(0.25, motif->columnCount);
00192 double odds = p/q;
00193 return logBase2(odds);
00194 }
00195 
00196 
00197 void dnaMotifNormalize(struct dnaMotif *motif)
00198 /* Make all columns of motif sum to one. */
00199 {
00200 int i;
00201 for (i=0; i<motif->columnCount; ++i)
00202     {
00203     float sum = motif->aProb[i] + motif->cProb[i] + motif->gProb[i] + motif->tProb[i];
00204     if (sum < 0)
00205         errAbort("%s has negative numbers, perhaps it's score not probability based", 
00206                 motif->name);
00207     if (sum == 0)
00208          motif->aProb[i] = motif->cProb[i] = motif->gProb[i] = motif->tProb[i] = 0.25;
00209     motif->aProb[i] /= sum;
00210     motif->cProb[i] /= sum;
00211     motif->gProb[i] /= sum;
00212     motif->tProb[i] /= sum;
00213     }
00214 }
00215 
00216 boolean dnaMotifIsScoreBased(struct dnaMotif *motif)
00217 /* Return TRUE if dnaMotif is score-based (which we decide by
00218  * the presense of negative values. */
00219 {
00220 int i;
00221 for (i=0; i<motif->columnCount; ++i)
00222     {
00223     if (motif->aProb[i] < 0) return TRUE;
00224     if (motif->cProb[i] < 0) return TRUE;
00225     if (motif->gProb[i] < 0) return TRUE;
00226     if (motif->tProb[i] < 0) return TRUE;
00227     }
00228 return FALSE;
00229 }
00230 
00231 void dnaMotifScoreToProb(struct dnaMotif *motif)
00232 /* Convert motif that is log-odds score based to motif
00233  * that is probability based.  This assumes that the
00234  * background distribution is simple: 25% for each base */
00235 {
00236 int i;
00237 for (i=0; i<motif->columnCount; ++i)
00238     {
00239     motif->aProb[i] = exp(motif->aProb[i]);
00240     motif->cProb[i] = exp(motif->cProb[i]);
00241     motif->gProb[i] = exp(motif->gProb[i]);
00242     motif->tProb[i] = exp(motif->tProb[i]);
00243     }
00244 dnaMotifNormalize(motif);
00245 }
00246 
00247 void dnaMotifMakeProbabalistic(struct dnaMotif *motif)
00248 /* Change motif, which may be score or count based, to 
00249  * probabalistic one, where each column adds to 1.0 */
00250 {
00251 if (dnaMotifIsScoreBased(motif))
00252     dnaMotifScoreToProb(motif);
00253 else
00254     dnaMotifNormalize(motif);
00255 }
00256 
00257 static void printProbRow(FILE *f, char *label, float *p, int pCount)
00258 /* Print one row of a probability profile. */
00259 {
00260 int i;
00261 fprintf(f, "%s ", label);
00262 for (i=0; i < pCount; ++i)
00263     fprintf(f, "%5.2f ", p[i]);
00264 printf("\n");
00265 }
00266 
00267 void dnaMotifPrintProb(struct dnaMotif *motif, FILE *f)
00268 /* Print DNA motif probabilities. */
00269 {
00270 printProbRow(f, "A", motif->aProb, motif->columnCount);
00271 printProbRow(f, "C", motif->cProb, motif->columnCount);
00272 printProbRow(f, "G", motif->gProb, motif->columnCount);
00273 printProbRow(f, "T", motif->tProb, motif->columnCount);
00274 }
00275 
00276 
00277 static double u1(double prob)
00278 /* Calculate partial uncertainty for one base. */
00279 {
00280 if (prob == 0)
00281     return 0;
00282 return prob * logBase2(prob);
00283 }
00284 
00285 static double uncertainty(struct dnaMotif *motif, int pos)
00286 /* Return the uncertainty at pos of motif.  This corresponds
00287  * to the H function in logo.pm */
00288 {
00289 return -( u1(motif->aProb[pos]) + u1(motif->cProb[pos])
00290         + u1(motif->gProb[pos]) +u1(motif->tProb[pos]) );
00291 }
00292 
00293 double dnaMotifBitsOfInfo(struct dnaMotif *motif, int pos)
00294 /* Return bits of information at position. */
00295 {
00296 if (pos > motif->columnCount || pos < 0)
00297     internalErr();
00298 return 2 - uncertainty(motif, pos);
00299 }
00300 
00301 struct letterProb
00302 /* A letter tied to a probability. */
00303     {
00304     struct letterProb *next;
00305     double prob;        /* Probability for this letter. */
00306     char letter;        /* The letter (upper case) */
00307     };
00308 
00309 static struct letterProb *letterProbNew(char letter, double prob)
00310 /* Make a new letterProb. */
00311 {
00312 struct letterProb *lp;
00313 AllocVar(lp);
00314 lp->letter = letter;
00315 lp->prob = prob;
00316 return lp;
00317 }
00318 
00319 static int letterProbCmp(const void *va, const void *vb)
00320 /* Compare to sort highest probability first. */
00321 {
00322 const struct letterProb *a = *((struct letterProb **)va);
00323 const struct letterProb *b = *((struct letterProb **)vb);
00324 double dif = a->prob - b->prob;
00325 if (dif < 0)
00326    return -1;
00327 else if (dif > 0)
00328    return 1;
00329 else
00330    return 0;
00331 }
00332 
00333 static void addBaseProb(struct letterProb **pList, char letter, double prob)
00334 /* If prob > 0 add letterProb to list. */
00335 {
00336 if (prob > 0)
00337     {
00338     struct letterProb *lp = letterProbNew(letter, prob);
00339     slAddHead(pList, lp);
00340     }
00341 }
00342 
00343 static struct letterProb *letterProbFromMotifColumn(struct dnaMotif *motif, int pos)
00344 /* Return letterProb list corresponding to column of motif. */
00345 {
00346 struct letterProb *lpList = NULL;
00347 addBaseProb(&lpList, 'A', motif->aProb[pos]);
00348 addBaseProb(&lpList, 'C', motif->cProb[pos]);
00349 addBaseProb(&lpList, 'G', motif->gProb[pos]);
00350 addBaseProb(&lpList, 'T', motif->tProb[pos]);
00351 slSort(&lpList, letterProbCmp);
00352 return lpList;
00353 }
00354 
00355 static void psOneColumn(struct dnaMotif *motif, int pos,
00356     double xStart, double yStart, double width, double totalHeight,
00357     FILE *f)
00358 /* Write one column of logo to postScript. */
00359 {
00360 struct letterProb *lp, *lpList = letterProbFromMotifColumn(motif, pos);
00361 double x = xStart, y = yStart, w = width, h;
00362 for (lp = lpList; lp != NULL; lp = lp->next)
00363     {
00364     h = totalHeight * lp->prob;
00365     if (h >= 1.0)
00366         {
00367         fprintf(f, "%cColor ", tolower(lp->letter));
00368         fprintf(f, "%3.2f ", x);
00369         fprintf(f, "%3.2f ", y);
00370         fprintf(f, "%3.2f ", x + w);
00371         fprintf(f, "%3.2f ", y + h);
00372         fprintf(f, "(%c) textInBox\n", lp->letter);
00373         }
00374     y += h;
00375     }
00376 fprintf(f, "\n");
00377 slFreeList(&lpList);
00378 }
00379 
00380 static void dnaMotifDims(struct dnaMotif *motif, double widthPerBase, double height, 
00381         int *retWidth, int *retHeight)
00382 /* Calculate dimensions of motif when rendered. */
00383 {
00384 static int widthFudgeFactor = 2, heightFudgeFactor = 2;
00385 *retWidth = ceil(widthPerBase * motif->columnCount) + widthFudgeFactor;
00386 *retHeight = ceil(height) + heightFudgeFactor;
00387 }
00388 
00389 void dnaMotifToLogoPs(struct dnaMotif *motif, double widthPerBase, double height, char *fileName)
00390 /* Write logo corresponding to motif to postScript file. */
00391 {
00392 FILE *f = mustOpen(fileName, "w");
00393 int i;
00394 int xStart = 0;
00395 int w, h;
00396 char *s = 
00397 #include "dnaMotif.pss"
00398 ;
00399 
00400 dnaMotifDims(motif, widthPerBase, height, &w, &h);
00401 fprintf(f, "%%!PS-Adobe-3.1 EPSF-3.0\n");
00402 fprintf(f, "%%%%BoundingBox: 0 0 %d %d\n\n", w, h);
00403 fprintf(f, "%s", s);
00404 
00405 fprintf(f, "%s", "% Start of code for this specific logo\n");
00406 
00407 for (i=0; i<motif->columnCount; ++i)
00408     {
00409     double infoScale = dnaMotifBitsOfInfo(motif, i)/2.0;
00410     psOneColumn(motif, i, xStart, 0, widthPerBase, infoScale * height, f);
00411     xStart += widthPerBase;
00412     }
00413 fprintf(f, "showpage\n");
00414 carefulClose(&f);
00415 }
00416 
00417 void dnaMotifToLogoPng(
00418         struct dnaMotif *motif, /* Motif to draw. */
00419         double widthPerBase,    /* Width of each base. */
00420         double height,          /* Max height. */
00421         char *gsExe,            /* ghostscript executable, NULL for default */
00422         char *tempDir,          /* temp dir , NULL for default */
00423         char *fileName)         /* output png file name. */
00424 /* Write logo corresponding to motif to png file. */
00425 {
00426 char *psName = rTempName(tempDir, "dnaMotif", ".ps");
00427 struct dyString *dy = dyStringNew(0);
00428 int w, h;
00429 int sysRet;
00430 
00431 if (gsExe == NULL) gsExe = "gs";
00432 if (tempDir == NULL) tempDir = "/tmp";
00433 dnaMotifToLogoPs(motif, widthPerBase, height, psName);
00434 dnaMotifDims(motif, widthPerBase, height, &w, &h);
00435 dyStringAppend(dy, gsExe);
00436 dyStringAppend(dy, " -sDEVICE=png16m -sOutputFile=");
00437 dyStringAppend(dy, fileName);
00438 dyStringAppend(dy, " -dBATCH -dNOPAUSE -q ");
00439 dyStringPrintf(dy, "-g%dx%d ", w, h);
00440 dyStringAppend(dy, psName);
00441 sysRet = system(dy->string);
00442 if (sysRet != 0)
00443     errAbort("System call returned %d for:\n  %s", sysRet, dy->string);
00444 
00445 /* Clean up. */
00446 dyStringFree(&dy);
00447 
00448 /* change permisssions so the webserver can access the file */
00449 dy = newDyString(0);
00450 dyStringPrintf(dy, "chmod 666 %s ", fileName);
00451 sysRet = system(dy->string);
00452 
00453 remove(psName);
00454 }
00455 
00456 void dnaMotifToLogoPsW(struct dnaMotif *motif, double widthPerBase, double width, double height, char *fileName)
00457 /* Write logo corresponding to motif to postScript file. */
00458 {
00459 FILE *f = mustOpen(fileName, "w");
00460 int i;
00461 int xStart = 0;
00462 int w, h;
00463 char *s = 
00464 #include "dnaMotif.pss"
00465 ;
00466 
00467 dnaMotifDims(motif, widthPerBase, height, &w, &h);
00468 fprintf(f, "%%!PS-Adobe-3.1 EPSF-3.0\n");
00469 fprintf(f, "%%%%BoundingBox: 0 0 %d %d\n\n", w, h);
00470 fprintf(f, "%s", s);
00471 
00472 fprintf(f, "%s", "% Start of code for this specific logo\n");
00473 
00474 for (i=0; i<motif->columnCount; ++i)
00475     {
00476     double infoScale = 0.9 ; 
00477     xStart = i * width / motif->columnCount;
00478     psOneColumn(motif, i, xStart, 0, widthPerBase, infoScale * height, f);
00479     }
00480 fprintf(f, "showpage\n");
00481 carefulClose(&f);
00482 }
00483 
00484 void dnaMotifToLogoPGM(
00485         struct dnaMotif *motif, /* Motif to draw. */
00486         double widthPerBase,    /* Width of each base. */
00487         double width,           /* Max width. */
00488         double height,          /* Max height. */
00489         char *gsExe,            /* ghostscript executable, NULL for default */
00490         char *tempDir,          /* temp dir , NULL for default */
00491         char *fileName)         /* output png file name. */
00492 /* Write logo corresponding to motif to pgm file. */
00493 {
00494 char *psName = rTempName(tempDir, "dnaMotif", ".ps");
00495 struct dyString *dy = dyStringNew(0);
00496 int w, h;
00497 int sysRet;
00498 
00499 if (gsExe == NULL) gsExe = "gs";
00500 if (tempDir == NULL) tempDir = "/tmp";
00501 dnaMotifToLogoPsW(motif, widthPerBase, width, height, psName);
00502 dnaMotifDims(motif, widthPerBase, height, &w, &h);
00503 dyStringAppend(dy, gsExe);
00504 dyStringAppend(dy, " -sDEVICE=pgmraw -sOutputFile=");
00505 dyStringAppend(dy, fileName);
00506 dyStringAppend(dy, " -dBATCH -dNOPAUSE -q ");
00507 dyStringPrintf(dy, "-g%dx%d ", (int) ceil(width), h);
00508 dyStringAppend(dy, psName);
00509 sysRet = system(dy->string);
00510 if (sysRet != 0)
00511     errAbort("System call returned %d for:\n  %s", sysRet, dy->string);
00512 
00513 /* Clean up. */
00514 dyStringFree(&dy);
00515 
00516 /* change permisssions so the webserver can access the file */
00517 dy = newDyString(0);
00518 dyStringPrintf(dy, "chmod 666 %s ", fileName);
00519 sysRet = system(dy->string);
00520 
00521 remove(psName);
00522 }

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2