00001
00002
00003
00004 #include "common.h"
00005 #include "memalloc.h"
00006 #include "cheapcgi.h"
00007 #include "dnautil.h"
00008 #include "xenalign.h"
00009 #include "pairHmm.h"
00010
00011 static char const rcsid[] = "$Id: xensmall.c,v 1.11 2006/03/15 18:36:16 angie Exp $";
00012
00013 static double calcGcRatio(DNA *a, int aSize, DNA *b, int bSize)
00014
00015 {
00016 int counts[4];
00017 int i;
00018 int total = 4;
00019 DNA base;
00020 int val;
00021 int gcCount;
00022
00023 for (i=0; i<ArraySize(counts); ++i)
00024 counts[i] = 1;
00025 for (i=0; i<aSize; ++i)
00026 {
00027 base = a[i];
00028 if ((val = ntVal[(int)base]) >= 0)
00029 {
00030 counts[val] += 1;
00031 total += 1;
00032 }
00033 }
00034 for (i=0; i<bSize; ++i)
00035 {
00036 base = b[i];
00037 if ((val = ntVal[(int)base]) >= 0)
00038 {
00039 counts[val] += 1;
00040 total += 1;
00041 }
00042 }
00043 gcCount = counts[C_BASE_VAL] + counts[G_BASE_VAL];
00044 return (double)gcCount/(double)total;
00045 }
00046
00047
00048 enum sevenStateIx
00049 {
00050 hiFiIx = 0,
00051 qSlipIx,
00052 tSlipIx,
00053 loFiIx,
00054 c1Ix,
00055 c2Ix,
00056 c3Ix,
00057 };
00058
00059
00060 static int loFiFromHiFiCost;
00061 static int loFiFromCodingCost;
00062 static int loFiFromInsertCost;
00063
00064 static int hiFiFromLoFiCost;
00065 static int hiFiFromCodingCost;
00066 static int hiFiFromInsertCost;
00067
00068 static int codingFromHiFiCost;
00069 static int codingFromLoFiCost;
00070 static int codingFromInsertCost;
00071
00072 static int insertFromHiFiCost;
00073 static int insertFromLoFiCost;
00074 static int insertFromCodingCost;
00075
00076 static int loFiSmallGapCost;
00077 static int hiFiSmallGapCost;
00078 static int codingSmallGapCost;
00079 static int largeGapExtensionCost;
00080
00081
00082 static int scaledLogOfFraction(double p, double q)
00083
00084 {
00085 return round(1000*log((double)p/q) );
00086 }
00087
00088 static int transitionCost(double prob)
00089
00090 {
00091 return round(1000*(log(prob)));
00092 }
00093
00094 static int c1c2MatchTable[26*32];
00095 static int c3MatchTable[26*32];
00096 static int hiFiMatchTable[26*32];
00097 static int loFiMatchTable[26*32];
00098
00099 #define letterIx(a) ((a)-'a')
00100
00101 static void makeMatchTable(int matchTable[], double matchProb, double gcRatio)
00102
00103
00104 {
00105 double atRatio = 1.0 - gcRatio;
00106 int unlikely = round(1000*log(0.0001));
00107 double mismatchProb = 1.0 - matchProb;
00108 double freq[4];
00109 DNA a,b;
00110 int i,j;
00111
00112 freq[A_BASE_VAL] = freq[T_BASE_VAL] = atRatio/2;
00113 freq[G_BASE_VAL] = freq[C_BASE_VAL] = gcRatio/2;
00114
00115 for (i=0; i<26*32; ++i)
00116 matchTable[i] = unlikely;
00117
00118 for (i=0; i<4; ++i)
00119 {
00120 a = valToNt[i];
00121 for (j=0; j<4; ++j)
00122 {
00123 b = valToNt[j];
00124 if (a == b)
00125 matchTable[32*letterIx(a) + letterIx(b)] = scaledLogOfFraction(matchProb, freq[j]);
00126 else
00127 matchTable[32*letterIx(a) + letterIx(b)] = scaledLogOfFraction(mismatchProb, 1.0 - freq[j]);
00128 }
00129 }
00130 }
00131
00132 static void makeWobbleMatchTable(int matchTable[], double gcRatio)
00133
00134 {
00135
00136 double atRatio = 1.0 - gcRatio;
00137 int unlikely = round(1000*log(0.0001));
00138 double freq[4];
00139 int i;
00140 double matchProb = 0.53;
00141 double wobbleProb = 0.33;
00142 double mismatchProb = 0.14*0.5;
00143
00144 freq[A_BASE_VAL] = freq[T_BASE_VAL] = atRatio/2;
00145 freq[G_BASE_VAL] = freq[C_BASE_VAL] = gcRatio/2;
00146
00147 for (i=0; i<26*32; ++i)
00148 matchTable[i] = unlikely;
00149 matchTable[32*letterIx('a') + letterIx('a')] = scaledLogOfFraction(matchProb, freq[A_BASE_VAL]);
00150 matchTable[32*letterIx('a') + letterIx('c')] = scaledLogOfFraction(mismatchProb, freq[C_BASE_VAL]);
00151 matchTable[32*letterIx('a') + letterIx('g')] = scaledLogOfFraction(wobbleProb, freq[G_BASE_VAL]);
00152 matchTable[32*letterIx('a') + letterIx('t')] = scaledLogOfFraction(mismatchProb, freq[T_BASE_VAL]);
00153
00154 matchTable[32*letterIx('c') + letterIx('a')] = scaledLogOfFraction(mismatchProb, freq[A_BASE_VAL]);
00155 matchTable[32*letterIx('c') + letterIx('c')] = scaledLogOfFraction(matchProb, freq[C_BASE_VAL]);
00156 matchTable[32*letterIx('c') + letterIx('g')] = scaledLogOfFraction(mismatchProb, freq[G_BASE_VAL]);
00157 matchTable[32*letterIx('c') + letterIx('t')] = scaledLogOfFraction(wobbleProb, freq[T_BASE_VAL]);
00158
00159 matchTable[32*letterIx('g') + letterIx('a')] = scaledLogOfFraction(wobbleProb, freq[A_BASE_VAL]);
00160 matchTable[32*letterIx('g') + letterIx('c')] = scaledLogOfFraction(mismatchProb, freq[C_BASE_VAL]);
00161 matchTable[32*letterIx('g') + letterIx('g')] = scaledLogOfFraction(matchProb, freq[G_BASE_VAL]);
00162 matchTable[32*letterIx('g') + letterIx('t')] = scaledLogOfFraction(mismatchProb, freq[T_BASE_VAL]);
00163
00164 matchTable[32*letterIx('t') + letterIx('a')] = scaledLogOfFraction(mismatchProb, freq[A_BASE_VAL]);
00165 matchTable[32*letterIx('t') + letterIx('c')] = scaledLogOfFraction(wobbleProb, freq[C_BASE_VAL]);
00166 matchTable[32*letterIx('t') + letterIx('g')] = scaledLogOfFraction(mismatchProb, freq[G_BASE_VAL]);
00167 matchTable[32*letterIx('t') + letterIx('t')] = scaledLogOfFraction(matchProb, freq[T_BASE_VAL]);
00168 }
00169
00170 static void calcCostBenefit(double gcRatio)
00171
00172 {
00173 loFiFromHiFiCost = transitionCost(1.0/30.0);
00174 codingFromHiFiCost = transitionCost(1.0/60.0);
00175 insertFromHiFiCost = transitionCost(1.0/1000.0);
00176
00177 loFiFromCodingCost = transitionCost(1.0/200.0);
00178 hiFiFromCodingCost = transitionCost(1.0/300.0);
00179 insertFromCodingCost = transitionCost(1.0/1000.0);
00180
00181 hiFiFromLoFiCost = transitionCost(1.0/100.0);
00182 codingFromLoFiCost = transitionCost(1.0/150.0);
00183 insertFromLoFiCost = transitionCost(1.0/400.0);
00184
00185 loFiFromInsertCost = transitionCost(1.0/60.0);
00186 hiFiFromInsertCost = transitionCost(1.0/1000.0);
00187 codingFromInsertCost = transitionCost(1.0/1000.0);
00188
00189 largeGapExtensionCost = scaledLogOfFraction(74,75);
00190
00191 loFiSmallGapCost = scaledLogOfFraction(1,10);
00192 hiFiSmallGapCost = scaledLogOfFraction(1,25);
00193 codingSmallGapCost = scaledLogOfFraction(1,300);
00194
00195 makeMatchTable(c1c2MatchTable, 0.92, gcRatio);
00196
00197
00198
00199 makeWobbleMatchTable(c3MatchTable, gcRatio);
00200
00201
00202
00203
00204 makeMatchTable(loFiMatchTable, 0.5, gcRatio);
00205
00206
00207
00208 makeMatchTable(hiFiMatchTable, 0.9, gcRatio);
00209
00210
00211 }
00212
00213
00214 int xenAlignSmall(DNA *query, int querySize, DNA *target, int targetSize, FILE *f,
00215 boolean printExtraAtEnds)
00216
00217
00218 {
00219 struct phmmMatrix *a;
00220 struct phmmState *hf, *lf, *iq, *it, *c1, *c2, *c3;
00221 int qIx, tIx, sIx;
00222 int rowOffset, newCellOffset;
00223 struct phmmAliPair *pairList;
00224 int matchOff, qSlipOff, tSlipOff;
00225 int bestScore = -0x4fffffff;
00226 struct phmmMommy *bestCell = NULL;
00227 int c1c2PairScore, c3PairScore, loFiPairScore, hiFiPairScore;
00228 int matchTableOffset;
00229 double gcRatio;
00230
00231
00232 if ((double)targetSize * querySize > 1.1E7)
00233 errAbort("Can't align %d x %d, too big\n", querySize, targetSize);
00234
00235
00236 gcRatio = calcGcRatio(query, querySize, target, targetSize);
00237 calcCostBenefit(gcRatio);
00238
00239
00240 a = phmmMatrixNew(7, query, querySize, target, targetSize);
00241 hf = phmmNameState(a, hiFiIx, "highFi", 'H');
00242 lf = phmmNameState(a, loFiIx, "lowFi", 'L');
00243 iq = phmmNameState(a, qSlipIx, "qSlip", 'Q');
00244 it = phmmNameState(a, tSlipIx, "tSlip", 'T');
00245 c1 = phmmNameState(a, c1Ix, "frame1", '1');
00246 c2 = phmmNameState(a, c2Ix, "frame2", '2');
00247 c3 = phmmNameState(a, c3Ix, "frame3", '3');
00248
00249 qSlipOff = -a->qDim;
00250 tSlipOff = -1;
00251 matchOff = qSlipOff + tSlipOff;
00252
00253 for (tIx = 1; tIx < a->tDim; tIx += 1)
00254 {
00255 UBYTE mommy = 0;
00256 int score, tempScore;
00257
00258
00259 #define matchScore lastScores[qIx-1]
00260 #define qSlipScore lastScores[qIx]
00261 #define tSlipScore scores[qIx-1]
00262 #define newScore scores[qIx]
00263
00264
00265 #define startState(state) \
00266 score = 0;
00267
00268
00269
00270 #define matchState(state, addScore) \
00271 { \
00272 if ((tempScore = state->matchScore + addScore) > score) \
00273 { \
00274 mommy = phmmPackMommy(state->stateIx, -1, -1); \
00275 score = tempScore; \
00276 } \
00277 }
00278
00279
00280
00281 #define qSlipState(state, addScore) \
00282 { \
00283 if ((tempScore = state->qSlipScore + addScore) > score) \
00284 { \
00285 mommy = phmmPackMommy(state->stateIx, 0, -1); \
00286 score = tempScore; \
00287 } \
00288 }
00289
00290
00291
00292 #define tSlipState(state, addScore) \
00293 { \
00294 if ((tempScore = state->tSlipScore + addScore) > score) \
00295 { \
00296 mommy = phmmPackMommy(state->stateIx, -1, 0); \
00297 score = tempScore; \
00298 } \
00299 }
00300
00301
00302 #define endState(state) \
00303 { \
00304 struct phmmMommy *newCell = state->cells + newCellOffset; \
00305 if (score <= 0) \
00306 { \
00307 mommy = phmmNullMommy; \
00308 score = 0; \
00309 } \
00310 newCell->mommy = mommy; \
00311 state->newScore = score; \
00312 if (score > bestScore) \
00313 { \
00314 bestScore = score; \
00315 bestCell = newCell; \
00316 } \
00317 }
00318
00319
00320
00321 #define shortEndState(state) \
00322 { \
00323 struct phmmMommy *newCell = state->cells + newCellOffset; \
00324 if (score <= 0) \
00325 { \
00326 mommy = phmmNullMommy; \
00327 score = 0; \
00328 } \
00329 newCell->mommy = mommy; \
00330 state->newScore = score; \
00331 }
00332
00333 rowOffset = tIx*a->qDim;
00334 for (qIx = 1; qIx < a->qDim; qIx += 1)
00335 {
00336 int qBase = letterIx(a->query[qIx-1]);
00337 int tBase = letterIx(a->target[tIx-1]);
00338
00339 newCellOffset = rowOffset + qIx;
00340
00341
00342 matchTableOffset = (qBase<<5) + tBase;
00343 c1c2PairScore = c1c2MatchTable[matchTableOffset];
00344 c3PairScore = c3MatchTable[matchTableOffset];
00345 hiFiPairScore = hiFiMatchTable[matchTableOffset];
00346 loFiPairScore = loFiMatchTable[matchTableOffset];
00347
00348
00349 {
00350 startState(hf);
00351 matchState(hf, hiFiPairScore);
00352 qSlipState(hf, hiFiSmallGapCost);
00353 tSlipState(hf, hiFiSmallGapCost);
00354 matchState(iq, hiFiPairScore + hiFiFromInsertCost);
00355 matchState(it, hiFiPairScore + hiFiFromInsertCost);
00356 matchState(lf, hiFiPairScore + hiFiFromLoFiCost);
00357 matchState(c1, hiFiPairScore + hiFiFromCodingCost);
00358 matchState(c2, hiFiPairScore + hiFiFromCodingCost);
00359 matchState(c3, hiFiPairScore + hiFiFromCodingCost);
00360 endState(hf);
00361 }
00362
00363
00364 {
00365 startState(lf);
00366 matchState(lf, loFiPairScore);
00367 qSlipState(lf, loFiSmallGapCost);
00368 tSlipState(lf, loFiSmallGapCost);
00369 matchState(iq, loFiPairScore + loFiFromInsertCost);
00370 matchState(it, loFiPairScore + loFiFromInsertCost);
00371 matchState(hf, loFiPairScore + loFiFromHiFiCost);
00372 matchState(c1, loFiPairScore + loFiFromCodingCost);
00373 matchState(c2, loFiPairScore + loFiFromCodingCost);
00374 matchState(c3, loFiPairScore + loFiFromCodingCost);
00375 endState(lf);
00376 }
00377
00378
00379 {
00380 startState(iq);
00381 qSlipState(iq, largeGapExtensionCost);
00382 qSlipState(hf, insertFromHiFiCost);
00383 qSlipState(lf, insertFromLoFiCost);
00384 qSlipState(c1, insertFromCodingCost);
00385 qSlipState(c2, insertFromCodingCost);
00386 qSlipState(c3, insertFromCodingCost);
00387 qSlipState(it, largeGapExtensionCost);
00388 shortEndState(iq);
00389 }
00390
00391
00392 {
00393 startState(it);
00394 tSlipState(it, largeGapExtensionCost);
00395 tSlipState(hf, insertFromHiFiCost);
00396 tSlipState(lf, insertFromLoFiCost);
00397 tSlipState(c1, insertFromCodingCost);
00398 tSlipState(c2, insertFromCodingCost);
00399 tSlipState(c3, insertFromCodingCost);
00400 shortEndState(it);
00401 }
00402
00403
00404 {
00405 startState(c1);
00406 matchState(c3, c1c2PairScore);
00407 qSlipState(c1, codingSmallGapCost);
00408 tSlipState(c1, codingSmallGapCost);
00409 matchState(iq, c1c2PairScore + codingFromInsertCost);
00410 matchState(it, c1c2PairScore + codingFromInsertCost);
00411 matchState(lf, c1c2PairScore + codingFromLoFiCost);
00412 matchState(hf, c1c2PairScore + codingFromHiFiCost);
00413 endState(c1);
00414 }
00415
00416
00417 {
00418 startState(c2);
00419 matchState(c1, c1c2PairScore);
00420 qSlipState(c2, codingSmallGapCost);
00421 tSlipState(c2, codingSmallGapCost);
00422 matchState(iq, c1c2PairScore + codingFromInsertCost);
00423 matchState(it, c1c2PairScore + codingFromInsertCost);
00424 matchState(lf, c1c2PairScore + codingFromLoFiCost);
00425 matchState(hf, c1c2PairScore + codingFromHiFiCost);
00426 endState(c2);
00427 }
00428
00429
00430
00431 {
00432 startState(c3);
00433 matchState(c2, c3PairScore);
00434 qSlipState(c3, codingSmallGapCost);
00435 tSlipState(c3, codingSmallGapCost);
00436 matchState(iq, c3PairScore + codingFromInsertCost);
00437 matchState(it, c3PairScore + codingFromInsertCost);
00438 matchState(lf, c3PairScore + codingFromLoFiCost);
00439 matchState(hf, c3PairScore + codingFromHiFiCost);
00440 endState(c3);
00441 }
00442 }
00443
00444
00445 for (sIx = 0; sIx < a->stateCount; ++sIx)
00446 {
00447 struct phmmState *as = &a->states[sIx];
00448 int *swapTemp = as->lastScores;
00449 as->lastScores = as->scores;
00450 as->scores = swapTemp;
00451 }
00452 }
00453
00454
00455 pairList = phmmTraceBack(a, bestCell);
00456 phmmPrintTrace(a, pairList, TRUE, f, printExtraAtEnds);
00457
00458 slFreeList(&pairList);
00459 phmmMatrixFree(&a);
00460 return bestScore;
00461 #undef matchScore
00462 #undef qSlipScore
00463 #undef tSlipScore
00464 #undef newScore
00465 #undef startState
00466 #undef matchState
00467 #undef qSlipState
00468 #undef tSlipState
00469 #undef shortEndState
00470 #undef endState
00471 }
00472