lib/gapCalc.c

Go to the documentation of this file.
00001 /* gapCalc - Stuff to calculate complex (but linear) gap costs quickly,
00002  * and read specifications from a file. */
00003 
00004 #include "common.h"
00005 #include "linefile.h"
00006 #include "gapCalc.h"
00007 
00008 static char const rcsid[] = "$Id: gapCalc.c,v 1.4 2006/06/18 23:09:36 kate Exp $";
00009 
00010 struct gapCalc
00011 /* A structure that bundles together stuff to help us
00012  * calculate gap costs quickly. */
00013     {
00014     int smallSize; /* Size of tables for doing quick lookup of small gaps. */
00015     int *qSmall;   /* Table for small gaps in q; */
00016     int *tSmall;   /* Table for small gaps in t. */
00017     int *bSmall;   /* Table for small gaps in either. */
00018     int *longPos;/* Table of positions to interpolate between for larger gaps. */
00019     double *qLong; /* Values to interpolate between for larger gaps in q. */
00020     double *tLong; /* Values to interpolate between for larger gaps in t. */
00021     double *bLong; /* Values to interpolate between for larger gaps in both. */
00022     int longCount;      /* Number of long positions overall in longPos. */
00023     int qPosCount;      /* Number of long positions in q. */
00024     int tPosCount;      /* Number of long positions in t. */
00025     int bPosCount;      /* Number of long positions in b. */
00026     int qLastPos;       /* Maximum position we have data on in q. */
00027     int tLastPos;       /* Maximum position we have data on in t. */
00028     int bLastPos;       /* Maximum position we have data on in b. */
00029     double qLastPosVal; /* Value at max pos. */
00030     double tLastPosVal; /* Value at max pos. */
00031     double bLastPosVal; /* Value at max pos. */
00032     double qLastSlope;  /* What to add for each base after last. */
00033     double tLastSlope;  /* What to add for each base after last. */
00034     double bLastSlope;  /* What to add for each base after last. */
00035     };
00036 
00037 /* These are the gap costs used in the Evolution's Cauldron paper. */
00038 static char *originalGapCosts = 
00039     "tableSize 11\n"
00040     "smallSize 111\n"
00041     "position 1 2 3 11 111 2111 12111 32111 72111 152111 252111\n"
00042     "qGap 350 425 450 600 900 2900 22900 57900 117900 217900 317900\n"
00043     "tGap 350 425 450 600 900 2900 22900 57900 117900 217900 317900\n"
00044     "bothGap 750 825 850 1000 1300 3300 23300 58300 118300 218300 318300\n";
00045 
00046 /* These gap costs work well at chicken/human distances, and seem
00047  * to do ok closer as well, so they are now the default. */
00048 static char *defaultGapCosts =
00049 "tablesize       11\n"
00050 "smallSize       111\n"
00051 "position        1       2       3       11      111     2111    12111   32111   72111   152111  252111\n"
00052 "qGap    325     360     400     450     600     1100    3600    7600    15600   31600   56600\n"
00053 "tGap    325     360     400     450     600     1100    3600    7600    15600   31600   56600\n"
00054 "bothGap 625     660     700     750     900     1400    4000    8000    16000   32000   57000\n";
00055 
00056 /* These gap costs are for query=mRNA, target=DNA. */
00057 static char *rnaDnaGapCosts = 
00058 "tablesize       12\n"
00059 "smallSize       111\n"
00060 "position        1       2       3       11     31   111   2111    12111   32111   72111   152111  252111\n"
00061        "qGap    325     360     400     450     600  800   1100    3600    7600    15600   31600   56600\n"
00062        "tGap    200     210     220     250     300  400   500      600     800    1200     2000   4000\n"
00063 "       bothGap 625     660     700     750     900  1100  1400    4000    8000    16000   32000   57000\n";
00064 
00065 static char *cheapGapCosts = 
00066     "tableSize 3\n"
00067     "smallSize 100\n"
00068     "position 1 100 1000\n"
00069     "qGap 0 30 300\n"
00070     "tGap 0 30 300\n"
00071     "bothGap 0 30 300\n";
00072 
00073 
00074 char *gapCalcSampleFileContents()
00075 /* Return contents of a sample linear gap file. */
00076 {
00077 return defaultGapCosts;
00078 }
00079 
00080 static int interpolate(int x, int *s, double *v, int sCount)
00081 /* Find closest value to x in s, and then lookup corresponding
00082  * value in v.  Interpolate where necessary. */
00083 {
00084 int i, ds, ss;
00085 double dv;
00086 for (i=0; i<sCount; ++i)
00087     {
00088     ss = s[i];
00089     if (x == ss)
00090         return v[i];
00091     else if (x < ss)
00092         {
00093         ds = ss - s[i-1];
00094         dv = v[i] - v[i-1];
00095         return v[i-1] + dv * (x - s[i-1]) / ds;
00096         }
00097     }
00098 /* If get to here extrapolate from last two values */
00099 ds = s[sCount-1] - s[sCount-2];
00100 dv = v[sCount-1] - v[sCount-2];
00101 return v[sCount-2] + dv * (x - s[sCount-2]) / ds;
00102 }
00103 
00104 static double calcSlope(double y2, double y1, double x2, double x1)
00105 /* Calculate slope of line from x1/y1 to x2/y2 */
00106 {
00107 return (y2-y1)/(x2-x1);
00108 }
00109 
00110 static void readTaggedNumLine(struct lineFile *lf, char *tag, 
00111         int count, int *intOut,  double *floatOut)
00112 /* Read in a line that starts with tag and then has count numbers.
00113  * Complain and die if tag is unexpected or other problem occurs. 
00114  * Put output as integers and/or floating point into intOut and 
00115  * floatOut. */
00116 {
00117 char *line;
00118 int i = 0;
00119 char *word;
00120 if (!lineFileNextReal(lf, &line))
00121    lineFileUnexpectedEnd(lf);
00122 word = nextWord(&line);
00123 if (!sameWord(tag, word))
00124     errAbort("Expecting %s got %s line %d of %s",
00125              tag, word, lf->lineIx, lf->fileName);
00126 for (i = 0; i < count; ++i)
00127     {
00128     word = nextWord(&line);
00129     if (word == NULL)
00130         errAbort("Not enough numbers line %d of %s", lf->lineIx, lf->fileName);
00131     if (!isdigit(word[0]))
00132         errAbort("Expecting number got %s line %d of %s",
00133                  word, lf->lineIx, lf->fileName);
00134     if (intOut)
00135         intOut[i] = atoi(word);
00136     if (floatOut)
00137         floatOut[i] = atof(word);
00138     }
00139 word = nextWord(&line);
00140 if (word != NULL)
00141         errAbort("Too many numbers line %d of %s", lf->lineIx, lf->fileName);
00142 }
00143 
00144 struct gapCalc *gapCalcRead(struct lineFile *lf)
00145 /* Create gapCalc from open file. */
00146 {
00147 int i, tableSize, startLong = -1;
00148 struct gapCalc *gapCalc;
00149 int *gapInitPos;  
00150 double *gapInitQGap;  
00151 double *gapInitTGap;  
00152 double *gapInitBothGap;
00153 
00154 AllocVar(gapCalc);
00155 
00156 /* Parse file. */
00157 readTaggedNumLine(lf, "tableSize", 1, &tableSize, NULL);
00158 readTaggedNumLine(lf, "smallSize", 1, &gapCalc->smallSize, NULL);
00159 AllocArray(gapInitPos,tableSize);
00160 AllocArray(gapInitQGap,tableSize);
00161 AllocArray(gapInitTGap,tableSize);
00162 AllocArray(gapInitBothGap,tableSize);
00163 readTaggedNumLine(lf, "position", tableSize, gapInitPos, NULL);
00164 readTaggedNumLine(lf, "qGap", tableSize, NULL, gapInitQGap);
00165 readTaggedNumLine(lf, "tGap", tableSize, NULL, gapInitTGap);
00166 readTaggedNumLine(lf, "bothGap", tableSize, NULL, gapInitBothGap);
00167 
00168 /* Set up precomputed interpolations for small gaps. */
00169 AllocArray(gapCalc->qSmall, gapCalc->smallSize);
00170 AllocArray(gapCalc->tSmall, gapCalc->smallSize);
00171 AllocArray(gapCalc->bSmall, gapCalc->smallSize);
00172 for (i=1; i<gapCalc->smallSize; ++i)
00173     {
00174     gapCalc->qSmall[i] = 
00175         interpolate(i, gapInitPos, gapInitQGap, tableSize);
00176     gapCalc->tSmall[i] = 
00177         interpolate(i, gapInitPos, gapInitTGap, tableSize);
00178     gapCalc->bSmall[i] = interpolate(i, gapInitPos, 
00179         gapInitBothGap, tableSize);
00180     }
00181 
00182 /* Set up to handle intermediate values. */
00183 for (i=0; i<tableSize; ++i)
00184     {
00185     if (gapCalc->smallSize == gapInitPos[i])
00186         {
00187         startLong = i;
00188         break;
00189         }
00190     }
00191 if (startLong < 0)
00192     errAbort("No position %d in gapCalcRead()\n", gapCalc->smallSize);
00193 gapCalc->longCount = tableSize - startLong;
00194 gapCalc->qPosCount = tableSize - startLong;
00195 gapCalc->tPosCount = tableSize - startLong;
00196 gapCalc->bPosCount = tableSize - startLong;
00197 gapCalc->longPos = cloneMem(gapInitPos + startLong, gapCalc->longCount * sizeof(int));
00198 gapCalc->qLong = cloneMem(gapInitQGap + startLong, gapCalc->qPosCount * sizeof(double));
00199 gapCalc->tLong = cloneMem(gapInitTGap + startLong, gapCalc->tPosCount * sizeof(double));
00200 gapCalc->bLong = cloneMem(gapInitBothGap + startLong, gapCalc->bPosCount * sizeof(double));
00201 
00202 /* Set up to handle huge values. */
00203 gapCalc->qLastPos = gapCalc->longPos[gapCalc->qPosCount-1];
00204 gapCalc->tLastPos = gapCalc->longPos[gapCalc->tPosCount-1];
00205 gapCalc->bLastPos = gapCalc->longPos[gapCalc->bPosCount-1];
00206 gapCalc->qLastPosVal = gapCalc->qLong[gapCalc->qPosCount-1];
00207 gapCalc->tLastPosVal = gapCalc->tLong[gapCalc->tPosCount-1];
00208 gapCalc->bLastPosVal = gapCalc->bLong[gapCalc->bPosCount-1];
00209 gapCalc->qLastSlope = calcSlope(gapCalc->qLastPosVal, gapCalc->qLong[gapCalc->qPosCount-2],
00210                            gapCalc->qLastPos, gapCalc->longPos[gapCalc->qPosCount-2]);
00211 gapCalc->tLastSlope = calcSlope(gapCalc->tLastPosVal, gapCalc->tLong[gapCalc->tPosCount-2],
00212                            gapCalc->tLastPos, gapCalc->longPos[gapCalc->tPosCount-2]);
00213 gapCalc->bLastSlope = calcSlope(gapCalc->bLastPosVal, gapCalc->bLong[gapCalc->bPosCount-2],
00214                            gapCalc->bLastPos, gapCalc->longPos[gapCalc->bPosCount-2]);
00215 freez(&gapInitPos);
00216 freez(&gapInitQGap);
00217 freez(&gapInitTGap);
00218 freez(&gapInitBothGap);
00219 return gapCalc;
00220 }
00221 
00222 struct gapCalc *gapCalcFromString(char *s)
00223 /* Return gapCalc from description string. */
00224 {
00225 struct lineFile *lf = lineFileOnString("string", TRUE, cloneString(s));
00226 struct gapCalc *gapCalc = gapCalcRead(lf);
00227 lineFileClose(&lf);
00228 return gapCalc;
00229 }
00230 
00231 struct gapCalc *gapCalcFromFile(char *fileName)
00232 /* Return gapCalc from file. */
00233 {
00234 struct gapCalc *gapCalc = NULL;
00235 
00236 if (sameString(fileName, "loose"))
00237     {
00238     verbose(2, "using loose linear gap costs (chicken/human)\n");
00239     gapCalc = gapCalcFromString(defaultGapCosts);
00240     }
00241 else if (sameString(fileName, "medium"))
00242     {
00243     verbose(2, "using medium (original) linear gap costs (mouse/human)\n");
00244     gapCalc = gapCalcFromString(originalGapCosts);
00245     }
00246 else
00247     {
00248     struct lineFile *lf = lineFileOpen(fileName, TRUE);
00249     gapCalc = gapCalcRead(lf);
00250     lineFileClose(&lf);
00251     }
00252 return gapCalc;
00253 }
00254 
00255 struct gapCalc *gapCalcDefault()
00256 /* Return default gapCalc. */
00257 {
00258 return gapCalcFromString(defaultGapCosts);
00259 }
00260 
00261 struct gapCalc *gapCalcRnaDna()
00262 /* Return gaps suitable for RNA queries vs. DNA targets */
00263 {
00264 return gapCalcFromString(rnaDnaGapCosts);
00265 }
00266 
00267 struct gapCalc *gapCalcCheap()
00268 /* Return cheap gap costs. */
00269 {
00270 return gapCalcFromString(cheapGapCosts);
00271 }
00272 
00273 struct gapCalc *gapCalcOriginal()
00274 /* Return gap costs from original paper. */
00275 {
00276 return gapCalcFromString(originalGapCosts);
00277 }
00278 
00279 void gapCalcFree(struct gapCalc **pGapCalc)
00280 /* Free up resources associated with gapCalc. */
00281 {
00282 struct gapCalc *gapCalc = *pGapCalc;
00283 if (gapCalc != NULL)
00284     {
00285     freeMem(gapCalc->qSmall);
00286     freeMem(gapCalc->tSmall);
00287     freeMem(gapCalc->bSmall);
00288     freeMem(gapCalc->longPos);
00289     freeMem(gapCalc->qLong);
00290     freeMem(gapCalc->tLong);
00291     freeMem(gapCalc->bLong);
00292     freez(pGapCalc);
00293     }
00294 }
00295 
00296 int gapCalcCost(struct gapCalc *gapCalc, int dq, int dt)
00297 /* Figure out gap costs. */
00298 {
00299 if (dt < 0) dt = 0;
00300 if (dq < 0) dq = 0;
00301 if (dt == 0)
00302     { 
00303     if (dq < gapCalc->smallSize)
00304         return gapCalc->qSmall[dq];
00305     else if (dq >= gapCalc->qLastPos)
00306         return gapCalc->qLastPosVal + gapCalc->qLastSlope * (dq-gapCalc->qLastPos);
00307     else
00308         return interpolate(dq, gapCalc->longPos, gapCalc->qLong, gapCalc->qPosCount);
00309     }
00310 else if (dq == 0)
00311     {
00312     if (dt < gapCalc->smallSize)
00313         return gapCalc->tSmall[dt];
00314     else if (dt >= gapCalc->tLastPos)
00315         return gapCalc->tLastPosVal + gapCalc->tLastSlope * (dt-gapCalc->tLastPos);
00316     else
00317         return interpolate(dt, gapCalc->longPos, gapCalc->tLong, gapCalc->tPosCount);
00318     }
00319 else
00320     {
00321     int both = dq + dt;
00322     if (both < gapCalc->smallSize)
00323         return gapCalc->bSmall[both];
00324     else if (both >= gapCalc->bLastPos)
00325         return gapCalc->bLastPosVal + gapCalc->bLastSlope * (both-gapCalc->bLastPos);
00326     else
00327         return interpolate(both, gapCalc->longPos, gapCalc->bLong, gapCalc->bPosCount);
00328     }
00329 }
00330 
00331 void gapCalcTest(struct gapCalc *gapCalc)
00332 /* Print out gap cost info. */
00333 {
00334 int i;
00335 for (i=1; i<=10; i++)
00336    {
00337    verbose(1, "%d: %d %d %d\n", i, gapCalcCost(gapCalc, i, 0), 
00338            gapCalcCost(gapCalc, 0, i), gapCalcCost(gapCalc, i/2, i-i/2));
00339    }
00340 for (i=1; ; i *= 10)
00341    {
00342    verbose(1, "%d: %d %d %d\n", i, gapCalcCost(gapCalc, i, 0), gapCalcCost(gapCalc, 0, i), 
00343            gapCalcCost(gapCalc, i/2, i-i/2));
00344    if (i == 1000000000)
00345        break;
00346    }
00347 verbose(1, "%d %d cost %d\n", 6489540, 84240, gapCalcCost(gapCalc, 84240, 6489540));
00348 verbose(1, "%d %d cost %d\n", 2746361, 1075188, gapCalcCost(gapCalc, 1075188, 2746361));
00349 verbose(1, "%d %d cost %d\n", 6489540 + 2746361 + 72, 84240 + 1075188 + 72, gapCalcCost(gapCalc, 84240 + 1075188 + 72, 6489540 + 2746361 + 72));
00350 }
00351 
00352 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2