00001
00002
00003
00004
00005
00006 #include "common.h"
00007 #include "pairHmm.h"
00008 #include "axt.h"
00009
00010 static char const rcsid[] = "$Id: axtAffine.c,v 1.7 2005/04/10 14:41:20 markd Exp $";
00011
00012
00013 boolean axtAffineSmallEnough(double querySize, double targetSize)
00014
00015
00016 {
00017 return targetSize * querySize <= 1.0E8;
00018 }
00019
00020 static void affineAlign(char *query, int querySize,
00021 char *target, int targetSize, struct axtScoreScheme *ss,
00022 struct phmmMatrix **retMatrix, struct phmmAliPair **retPairList,
00023 int *retScore)
00024
00025
00026 {
00027 struct phmmMatrix *a;
00028 struct phmmState *hf, *iq, *it;
00029 int qIx, tIx, sIx;
00030 int rowOffset, newCellOffset;
00031 int bestScore = -0x4fffffff;
00032 struct phmmMommy *bestCell = NULL;
00033 int matchPair;
00034 int gapStart, gapExt;
00035
00036
00037 if (!axtAffineSmallEnough(querySize, targetSize))
00038 errAbort("Can't align %d x %d, too big\n", querySize, targetSize);
00039
00040 gapStart = -ss->gapOpen;
00041 gapExt = -ss->gapExtend;
00042
00043
00044 a = phmmMatrixNew(3, query, querySize, target, targetSize);
00045 hf = phmmNameState(a, 0, "match", 'M');
00046 iq = phmmNameState(a, 1, "qSlip", 'Q');
00047 it = phmmNameState(a, 2, "tSlip", 'T');
00048
00049 for (tIx = 1; tIx < a->tDim; tIx += 1)
00050 {
00051 UBYTE mommy = 0;
00052 int score, tempScore;
00053
00054
00055 #define matchScore lastScores[qIx-1]
00056 #define qSlipScore lastScores[qIx]
00057 #define tSlipScore scores[qIx-1]
00058 #define newScore scores[qIx]
00059
00060
00061 #define startState(state) \
00062 score = 0;
00063
00064
00065
00066 #define matchState(state, addScore) \
00067 { \
00068 if ((tempScore = state->matchScore + addScore) > score) \
00069 { \
00070 mommy = phmmPackMommy(state->stateIx, -1, -1); \
00071 score = tempScore; \
00072 } \
00073 }
00074
00075
00076
00077 #define qSlipState(state, addScore) \
00078 { \
00079 if ((tempScore = state->qSlipScore + addScore) > score) \
00080 { \
00081 mommy = phmmPackMommy(state->stateIx, 0, -1); \
00082 score = tempScore; \
00083 } \
00084 }
00085
00086
00087
00088 #define tSlipState(state, addScore) \
00089 { \
00090 if ((tempScore = state->tSlipScore + addScore) > score) \
00091 { \
00092 mommy = phmmPackMommy(state->stateIx, -1, 0); \
00093 score = tempScore; \
00094 } \
00095 }
00096
00097
00098 #define endState(state) \
00099 { \
00100 struct phmmMommy *newCell = state->cells + newCellOffset; \
00101 if (score <= 0) \
00102 { \
00103 mommy = phmmNullMommy; \
00104 score = 0; \
00105 } \
00106 newCell->mommy = mommy; \
00107 state->newScore = score; \
00108 if (score > bestScore) \
00109 { \
00110 bestScore = score; \
00111 bestCell = newCell; \
00112 } \
00113 }
00114
00115
00116
00117 #define shortEndState(state) \
00118 { \
00119 struct phmmMommy *newCell = state->cells + newCellOffset; \
00120 if (score <= 0) \
00121 { \
00122 mommy = phmmNullMommy; \
00123 score = 0; \
00124 } \
00125 newCell->mommy = mommy; \
00126 state->newScore = score; \
00127 }
00128
00129
00130 rowOffset = tIx*a->qDim;
00131 for (qIx = 1; qIx < a->qDim; qIx += 1)
00132 {
00133 newCellOffset = rowOffset + qIx;
00134
00135
00136 matchPair = ss->matrix[(int)a->query[qIx-1]][(int)a->target[tIx-1]];
00137
00138
00139 {
00140 startState(hf);
00141 matchState(hf, matchPair);
00142 matchState(iq, matchPair);
00143 matchState(it, matchPair);
00144 endState(hf);
00145 }
00146
00147
00148 {
00149 startState(iq);
00150 qSlipState(iq, gapExt);
00151 qSlipState(hf, gapStart);
00152 qSlipState(it, gapStart);
00153 shortEndState(iq);
00154 }
00155
00156
00157 {
00158 startState(it);
00159 tSlipState(it, gapExt);
00160 tSlipState(hf, gapStart);
00161 shortEndState(it);
00162 }
00163
00164 }
00165
00166
00167 for (sIx = 0; sIx < a->stateCount; ++sIx)
00168 {
00169 struct phmmState *as = &a->states[sIx];
00170 int *swapTemp = as->lastScores;
00171 as->lastScores = as->scores;
00172 as->scores = swapTemp;
00173 }
00174 }
00175
00176
00177 *retPairList = phmmTraceBack(a, bestCell);
00178 *retMatrix = a;
00179 *retScore = bestScore;
00180
00181 #undef matchScore
00182 #undef qSlipScore
00183 #undef tSlipScore
00184 #undef newScore
00185 #undef startState
00186 #undef matchState
00187 #undef qSlipState
00188 #undef tSlipState
00189 #undef shortEndState
00190 #undef endState
00191 }
00192
00193 struct axt *axtAffine(bioSeq *query, bioSeq *target, struct axtScoreScheme *ss)
00194
00195 {
00196 struct axt *axt;
00197 int score;
00198 struct phmmMatrix *matrix;
00199 struct phmmAliPair *pairList;
00200
00201 affineAlign(query->dna, query->size, target->dna, target->size, ss,
00202 &matrix, &pairList, &score);
00203 axt = phhmTraceToAxt(matrix, pairList, score, query->name, target->name);
00204 phmmMatrixFree(&matrix);
00205 slFreeList(&pairList);
00206 return axt;
00207 }
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231 #define WORST 0xC0000000
00232
00233
00234
00235
00236 struct cell2L
00237 {
00238 int bestm;
00239 int bestd;
00240 int besti;
00241 char backm;
00242 char backd;
00243 char backi;
00244 };
00245
00246 #ifdef DEBUG
00247 void dump2L(struct cell2L* c)
00248
00249
00250
00251
00252
00253 {
00254 printf("%04d%c %04d%c %04d%c ",
00255 c->bestd, c->backd,
00256 c->bestm, c->backm,
00257 c->besti, c->backi
00258 );
00259 }
00260 #endif
00261
00262 void kForwardAffine(
00263 struct cell2L *cells,
00264 int row,
00265 int rowmax,
00266 int rdelta,
00267 int cmost,
00268 int lv,
00269 char *q,
00270 char *t,
00271 struct axtScoreScheme *ss,
00272 int *bestbestOut,
00273 int *bestrOut,
00274 int *bestcOut,
00275 char *bestdirOut
00276 )
00277
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292 {
00293 int r=0, rr=0;
00294 int gapOpen =ss->gapOpen;
00295 int gapExtend=ss->gapExtend;
00296 int doubleGap=ss->gapExtend;
00297 struct cell2L *cellp,*cellc;
00298 struct cell2L *u,*d,*l,*s;
00299 int c=0;
00300 int bestbest = *bestbestOut;
00301 cellc = cells+(row-1)*lv;
00302
00303 #ifdef DEBUG
00304 for(c=0;c<=cmost;c++)
00305 { dump2L(cellc+c); }
00306 printf("\n");
00307 #endif
00308
00309 for(r=row; r<=rowmax; r++)
00310 {
00311 cellp = cellc;
00312 cellc += lv;
00313
00314 rr = r+rdelta;
00315
00316 d = cellp;
00317 l = cellc;
00318 u = d+1;
00319 s = l+1;
00320
00321
00322 l->bestm=WORST;
00323 l->bestd=d->bestd-gapExtend;
00324 l->besti=WORST;
00325 l->backm='x';
00326 l->backd='d';
00327 l->backi='x';
00328 if (rr==1)
00329 {
00330 l->bestd=-gapOpen;
00331 l->backd='m';
00332 }
00333 #ifdef DEBUG
00334 dump2L(cellc);
00335 #endif
00336
00337 for(c=1; c<=cmost; c++)
00338 {
00339
00340 int best=WORST;
00341 int try =WORST;
00342 char dir=' ';
00343
00344 int subst = ss->matrix[(int)q[c-1]][(int)t[rr-1]];
00345
00346
00347 best=WORST;
00348 try=d->bestd;
00349 if (try > best)
00350 {
00351 best=try;
00352 dir='d';
00353 }
00354 try=d->bestm;
00355 if (try > best)
00356 {
00357 best=try;
00358 dir='m';
00359 }
00360 try=d->besti;
00361 if (try > best)
00362 {
00363 best=try;
00364 dir='i';
00365 }
00366 try=0;
00367 if (try > best)
00368 {
00369 best=try;
00370 dir='s';
00371 }
00372 best += subst;
00373 s->bestm = best;
00374 s->backm = dir;
00375 if (best > bestbest)
00376 {
00377 bestbest=best;
00378 *bestbestOut=best;
00379 *bestrOut=rr;
00380 *bestcOut=c;
00381 *bestdirOut=dir;
00382 }
00383
00384
00385 best=WORST;
00386 try=u->bestd - gapExtend;
00387 if (try > best)
00388 {
00389 best=try;
00390 dir='d';
00391 }
00392 try=u->bestm - gapOpen;
00393 if (try > best)
00394 {
00395 best=try;
00396 dir='m';
00397 }
00398 try=u->besti - doubleGap;
00399 if (try > best)
00400 {
00401 best=try;
00402 dir='i';
00403 }
00404 s->bestd = best;
00405 s->backd = dir;
00406 if (best > bestbest)
00407 {
00408 bestbest=best;
00409 *bestbestOut=best;
00410 *bestrOut=rr;
00411 *bestcOut=c;
00412 *bestdirOut=dir;
00413 }
00414
00415
00416 best=WORST;
00417 try=l->bestd - doubleGap;
00418 if (try > best)
00419 {
00420 best=try;
00421 dir='d';
00422 }
00423 try=l->bestm - gapOpen;
00424 if (try > best)
00425 {
00426 best=try;
00427 dir='m';
00428 }
00429 try=l->besti - gapExtend;
00430 if (try > best)
00431 {
00432 best=try;
00433 dir='i';
00434 }
00435 s->besti = best;
00436 s->backi = dir;
00437 if (best > bestbest)
00438 {
00439 bestbest=best;
00440 *bestbestOut=best;
00441 *bestrOut=rr;
00442 *bestcOut=c;
00443 *bestdirOut=dir;
00444 }
00445
00446 #ifdef DEBUG
00447 dump2L(cellc+c);
00448 #endif
00449
00450 d++;l++;u++;s++;
00451
00452 }
00453 #ifdef DEBUG
00454 printf("\n");
00455 #endif
00456
00457 }
00458 }
00459
00460
00461
00462 struct axt *axtAffine2Level(bioSeq *query, bioSeq *target, struct axtScoreScheme *ss)
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482
00483
00484
00485
00486
00487 {
00488 struct axt *axt=needMem(sizeof(struct axt));
00489
00490 char *q = query->dna;
00491 char *t = target->dna;
00492
00493 int Q= query->size;
00494 int T=target->size;
00495 int lv=Q+1;
00496 int lw=T+1;
00497
00498
00499 int r = 0;
00500 int c = 0;
00501 char dir=' ';
00502 int bestbest = WORST;
00503
00504 int k=0;
00505 int ksize = 0;
00506 int arrsize = 0;
00507 struct cell2L *cells = NULL;
00508 int ki = 0;
00509 int cmost = Q;
00510 int kmax = 0;
00511 int rr = 0;
00512 int nrows = 0;
00513 int bestr = 0;
00514 int bestc = 0;
00515 char bestdir = 0;
00516 int temp = 0;
00517
00518
00519 char *btq=NULL;
00520 char *btt=NULL;
00521
00522 ksize = (int) (-1 + sqrt(8*lw+1))/2;
00523 if (((ksize*(ksize+1))/2) < lw)
00524 {ksize++;}
00525 arrsize = (ksize+1) * lv;
00526 cells = needLargeMem(arrsize * sizeof(struct cell2L));
00527
00528 #ifdef DEBUG
00529 printf("\n k=%d \n ksize=%d \n arrsize=%d \n Q,lv=%d,%d T=%d \n \n",k,ksize,arrsize,Q,lv,T);
00530 #endif
00531
00532 axt->next = NULL;
00533 axt->qName = cloneString(query->name);
00534 axt->tName = cloneString(target->name);
00535 axt->qStrand ='+';
00536 axt->tStrand ='+';
00537 axt->frame = 0;
00538 axt->score=0;
00539 axt->qStart=0;
00540 axt->tStart=0;
00541 axt->qEnd=0;
00542 axt->tEnd=0;
00543 axt->symCount=0;
00544 axt->qSym=NULL;
00545 axt->tSym=NULL;
00546
00547 if ((Q==0) || (T==0))
00548 {
00549 axt->qSym=cloneString("");
00550 axt->tSym=cloneString("");
00551 freez(&cells);
00552 return axt;
00553 }
00554
00555
00556
00557
00558 cells[0].bestm=0;
00559 cells[0].bestd=WORST;
00560 cells[0].besti=WORST;
00561 cells[0].backm='x';
00562 cells[0].backd='x';
00563 cells[0].backi='x';
00564 #ifdef DEBUG
00565 dump2L(cells);
00566 #endif
00567
00568
00569 cells[1].bestm=WORST;
00570 cells[1].bestd=WORST;
00571 cells[1].besti=-ss->gapOpen;
00572 cells[1].backm='x';
00573 cells[1].backd='x';
00574 cells[1].backi='m';
00575 #ifdef DEBUG
00576 dump2L(cells+1);
00577 #endif
00578
00579
00580 for (c=2;c<lv;c++)
00581 {
00582 cells[c].bestm=WORST;
00583 cells[c].bestd=WORST;
00584 cells[c].besti=cells[c-1].besti-ss->gapExtend;
00585 cells[c].backm='x';
00586 cells[c].backd='x';
00587 cells[c].backi='i';
00588 #ifdef DEBUG
00589 dump2L(cells+c);
00590 #endif
00591 }
00592 #ifdef DEBUG
00593 printf("\n");
00594 printf("\n");
00595 #endif
00596
00597 k=ksize;
00598
00599 ki++;
00600
00601 r=1;
00602 while(1)
00603 {
00604 nrows = k;
00605 if (nrows > (lw-r)) {nrows=lw-r;}
00606 kmax = ki+nrows-1;
00607
00608 kForwardAffine(cells, ki, kmax, r-ki, cmost, lv, q, t, ss, &bestbest, &bestr, &bestc, &bestdir);
00609 #ifdef DEBUG
00610 printf("\n");
00611 #endif
00612
00613 r += nrows;
00614
00615 if (nrows == k)
00616 {
00617
00618
00619 memcpy(cells+ki*lv,cells+kmax*lv,sizeof(struct cell2L) *lv);
00620 }
00621
00622 if (r >= lw){break;}
00623
00624 ki++;
00625 k--;
00626 }
00627
00628 #ifdef DEBUG
00629 printf("\nFWD PASS DONE. bestbest=%d bestr=%d bestc=%d bestdir=%c \n\n",bestbest,bestr,bestc,bestdir);
00630 #endif
00631
00632
00633
00634
00635
00636
00637
00638 if (bestbest <= 0)
00639 {
00640 bestr=0;
00641 bestc=0;
00642
00643 }
00644
00645 r = bestr;
00646 c = bestc;
00647 dir = bestdir;
00648 cmost = c;
00649
00650 axt->qEnd=bestc;
00651 axt->tEnd=bestr;
00652
00653 temp = (2*ksize)+1;
00654 ki = (int)(temp-sqrt((temp*temp)-(8*r)))/2;
00655 rr = ((2*ksize*ki)+ki-(ki*ki))/2;
00656 kmax = ki+(r-rr);
00657 k = ksize - ki;
00658
00659
00660
00661
00662
00663
00664 #ifdef DEBUG
00665 printf("bestr=%d, bestc=%d, bestdir=%c k=%d, ki=%d, kmax=%d\n",bestr,bestc,bestdir,k,ki,kmax);
00666 #endif
00667
00668 kForwardAffine(cells, ki+1, kmax, rr-ki, cmost, lv, q, t, ss, &bestbest, &bestr, &bestc, &bestdir);
00669
00670 #ifdef DEBUG
00671 printf("\n(initial)BKWD PASS DONE. cmost=%d r=%d c=%d dir=%c \n\n",cmost,r,c,dir);
00672 #endif
00673
00674
00675
00676
00677
00678
00679 axt->symCount=0;
00680 axt->qSym = needLargeMem((Q+T+1)*sizeof(char));
00681 axt->tSym = needLargeMem((Q+T+1)*sizeof(char));
00682 btq=axt->qSym;
00683 btt=axt->tSym;
00684 while(1)
00685 {
00686 while(1)
00687 {
00688 #ifdef DEBUG
00689 printf("bt: r=%d, c=%d, dir=%c \n",r,c,dir);
00690 #endif
00691
00692
00693 if ((r==0) && (c==0)){break;}
00694 if (r<rr){break;}
00695 if (dir=='x'){errAbort("unexpected error backtracing");}
00696 if (dir=='s'){break;}
00697 if (dir=='m')
00698 {
00699 *btq++=q[c-1];
00700 *btt++=t[r-1];
00701 axt->symCount++;
00702 dir = cells[lv*(ki+r-rr)+c].backm;
00703 r--;
00704 c--;
00705 cmost--;
00706 }
00707 else
00708 {
00709 if (dir=='d')
00710 {
00711 *btq++='-';
00712 *btt++=t[r-1];
00713 axt->symCount++;
00714 dir = cells[lv*(ki+r-rr)+c].backd;
00715 r--;
00716 }
00717 else
00718 {
00719 *btq++=q[c-1];
00720 *btt++='-';
00721 axt->symCount++;
00722 dir = cells[lv*(ki+r-rr)+c].backi;
00723 c--;
00724 cmost--;
00725 }
00726 }
00727
00728 }
00729
00730
00731 ki--;
00732 k++;
00733 rr-=k;
00734 kmax = ki+k-1;
00735
00736
00737 if (ki < 0) {break;}
00738 if ((r==0)&&(c==0)) {break;}
00739 if (dir=='s') {break;}
00740
00741
00742
00743
00744
00745
00746 #ifdef DEBUG
00747 printf("bestr=%d, bestc=%d, bestdir=%c k=%d, ki=%d, kmax=%d\n",bestr,bestc,bestdir,k,ki,kmax);
00748 #endif
00749
00750
00751 kForwardAffine(cells, ki+1, kmax, rr-ki, cmost, lv, q, t, ss, &bestbest, &bestr, &bestc, &bestdir);
00752
00753 #ifdef DEBUG
00754 printf("\nBKWD PASS DONE. cmost=%d r=%d c=%d\n\n",cmost,r,c);
00755 #endif
00756
00757 }
00758
00759 axt->qStart=c;
00760 axt->tStart=r;
00761
00762
00763
00764 reverseBytes(axt->qSym,axt->symCount);
00765 reverseBytes(axt->tSym,axt->symCount);
00766 axt->qSym[axt->symCount]=0;
00767 axt->tSym[axt->symCount]=0;
00768
00769 axt->score=bestbest;
00770
00771
00772
00773
00774
00775
00776 freez(&cells);
00777
00778 return axt;
00779 }
00780
00781
00782