00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #include "common.h"
00016 #include "dnautil.h"
00017
00018 static char const rcsid[] = "$Id: dnautil.c,v 1.49 2007/03/14 04:54:55 kent Exp $";
00019
00020 struct codonTable
00021
00022 {
00023 DNA *codon;
00024 AA protCode;
00025 AA mitoCode;
00026 };
00027
00028 struct codonTable codonTable[] =
00029
00030 {
00031 {"ttt", 'F', 'F',},
00032 {"ttc", 'F', 'F',},
00033 {"tta", 'L', 'L',},
00034 {"ttg", 'L', 'L',},
00035
00036 {"tct", 'S', 'S',},
00037 {"tcc", 'S', 'S',},
00038 {"tca", 'S', 'S',},
00039 {"tcg", 'S', 'S',},
00040
00041 {"tat", 'Y', 'Y',},
00042 {"tac", 'Y', 'Y',},
00043 {"taa", 0, 0,},
00044 {"tag", 0, 0,},
00045
00046 {"tgt", 'C', 'C',},
00047 {"tgc", 'C', 'C',},
00048 {"tga", 0, 'W',},
00049 {"tgg", 'W', 'W',},
00050
00051
00052 {"ctt", 'L', 'L',},
00053 {"ctc", 'L', 'L',},
00054 {"cta", 'L', 'L',},
00055 {"ctg", 'L', 'L',},
00056
00057 {"cct", 'P', 'P',},
00058 {"ccc", 'P', 'P',},
00059 {"cca", 'P', 'P',},
00060 {"ccg", 'P', 'P',},
00061
00062 {"cat", 'H', 'H',},
00063 {"cac", 'H', 'H',},
00064 {"caa", 'Q', 'Q',},
00065 {"cag", 'Q', 'Q',},
00066
00067 {"cgt", 'R', 'R',},
00068 {"cgc", 'R', 'R',},
00069 {"cga", 'R', 'R',},
00070 {"cgg", 'R', 'R',},
00071
00072
00073 {"att", 'I', 'I',},
00074 {"atc", 'I', 'I',},
00075 {"ata", 'I', 'M',},
00076 {"atg", 'M', 'M',},
00077
00078 {"act", 'T', 'T',},
00079 {"acc", 'T', 'T',},
00080 {"aca", 'T', 'T',},
00081 {"acg", 'T', 'T',},
00082
00083 {"aat", 'N', 'N',},
00084 {"aac", 'N', 'N',},
00085 {"aaa", 'K', 'K',},
00086 {"aag", 'K', 'K',},
00087
00088 {"agt", 'S', 'S',},
00089 {"agc", 'S', 'S',},
00090 {"aga", 'R', 0,},
00091 {"agg", 'R', 0,},
00092
00093
00094 {"gtt", 'V', 'V',},
00095 {"gtc", 'V', 'V',},
00096 {"gta", 'V', 'V',},
00097 {"gtg", 'V', 'V',},
00098
00099 {"gct", 'A', 'A',},
00100 {"gcc", 'A', 'A',},
00101 {"gca", 'A', 'A',},
00102 {"gcg", 'A', 'A',},
00103
00104 {"gat", 'D', 'D',},
00105 {"gac", 'D', 'D',},
00106 {"gaa", 'E', 'E',},
00107 {"gag", 'E', 'E',},
00108
00109 {"ggt", 'G', 'G',},
00110 {"ggc", 'G', 'G',},
00111 {"gga", 'G', 'G',},
00112 {"ggg", 'G', 'G',},
00113 };
00114
00115
00116
00117
00118
00119
00120
00121 int ntVal[256];
00122 int ntValLower[256];
00123 int ntValUpper[256];
00124 int ntVal5[256];
00125 int ntValNoN[256];
00126 DNA valToNt[(N_BASE_VAL|MASKED_BASE_BIT)+1];
00127
00128
00129 int ntValMasked[256];
00130 DNA valToNtMasked[256];
00131
00132 static boolean inittedNtVal = FALSE;
00133
00134 static void initNtVal()
00135 {
00136 if (!inittedNtVal)
00137 {
00138 int i;
00139 for (i=0; i<ArraySize(ntVal); i++)
00140 {
00141 ntValUpper[i] = ntValLower[i] = ntVal[i] = -1;
00142 ntValNoN[i] = T_BASE_VAL;
00143 if (isspace(i) || isdigit(i))
00144 ntVal5[i] = ntValMasked[i] = -1;
00145 else
00146 {
00147 ntVal5[i] = N_BASE_VAL;
00148 ntValMasked[i] = (islower(i) ? (N_BASE_VAL|MASKED_BASE_BIT) : N_BASE_VAL);
00149 }
00150 }
00151 ntVal5['t'] = ntVal5['T'] = ntValNoN['t'] = ntValNoN['T'] = ntVal['t'] = ntVal['T'] =
00152 ntValLower['t'] = ntValUpper['T'] = T_BASE_VAL;
00153 ntVal5['u'] = ntVal5['U'] = ntValNoN['u'] = ntValNoN['U'] = ntVal['u'] = ntVal['U'] =
00154 ntValLower['u'] = ntValUpper['U'] = U_BASE_VAL;
00155 ntVal5['c'] = ntVal5['C'] = ntValNoN['c'] = ntValNoN['C'] = ntVal['c'] = ntVal['C'] =
00156 ntValLower['c'] = ntValUpper['C'] = C_BASE_VAL;
00157 ntVal5['a'] = ntVal5['A'] = ntValNoN['a'] = ntValNoN['A'] = ntVal['a'] = ntVal['A'] =
00158 ntValLower['a'] = ntValUpper['A'] = A_BASE_VAL;
00159 ntVal5['g'] = ntVal5['G'] = ntValNoN['g'] = ntValNoN['G'] = ntVal['g'] = ntVal['G'] =
00160 ntValLower['g'] = ntValUpper['G'] = G_BASE_VAL;
00161
00162 valToNt[T_BASE_VAL] = valToNt[T_BASE_VAL|MASKED_BASE_BIT] = 't';
00163 valToNt[C_BASE_VAL] = valToNt[C_BASE_VAL|MASKED_BASE_BIT] = 'c';
00164 valToNt[A_BASE_VAL] = valToNt[A_BASE_VAL|MASKED_BASE_BIT] = 'a';
00165 valToNt[G_BASE_VAL] = valToNt[G_BASE_VAL|MASKED_BASE_BIT] = 'g';
00166 valToNt[N_BASE_VAL] = valToNt[N_BASE_VAL|MASKED_BASE_BIT] = 'n';
00167
00168
00169 ntValMasked['T'] = T_BASE_VAL;
00170 ntValMasked['U'] = U_BASE_VAL;
00171 ntValMasked['C'] = C_BASE_VAL;
00172 ntValMasked['A'] = A_BASE_VAL;
00173 ntValMasked['G'] = G_BASE_VAL;
00174
00175 ntValMasked['t'] = T_BASE_VAL|MASKED_BASE_BIT;
00176 ntValMasked['u'] = U_BASE_VAL|MASKED_BASE_BIT;
00177 ntValMasked['c'] = C_BASE_VAL|MASKED_BASE_BIT;
00178 ntValMasked['a'] = A_BASE_VAL|MASKED_BASE_BIT;
00179 ntValMasked['g'] = G_BASE_VAL|MASKED_BASE_BIT;
00180
00181 valToNtMasked[T_BASE_VAL] = 'T';
00182 valToNtMasked[C_BASE_VAL] = 'C';
00183 valToNtMasked[A_BASE_VAL] = 'A';
00184 valToNtMasked[G_BASE_VAL] = 'G';
00185 valToNtMasked[N_BASE_VAL] = 'N';
00186
00187 valToNtMasked[T_BASE_VAL|MASKED_BASE_BIT] = 't';
00188 valToNtMasked[C_BASE_VAL|MASKED_BASE_BIT] = 'c';
00189 valToNtMasked[A_BASE_VAL|MASKED_BASE_BIT] = 'a';
00190 valToNtMasked[G_BASE_VAL|MASKED_BASE_BIT] = 'g';
00191 valToNtMasked[N_BASE_VAL|MASKED_BASE_BIT] = 'n';
00192
00193 inittedNtVal = TRUE;
00194 }
00195 }
00196
00197
00198
00199
00200 AA lookupCodon(DNA *dna)
00201 {
00202 int ix;
00203 int i;
00204 char c;
00205
00206 if (!inittedNtVal)
00207 initNtVal();
00208 ix = 0;
00209 for (i=0; i<3; ++i)
00210 {
00211 int bv = ntVal[(int)dna[i]];
00212 if (bv<0)
00213 return 'X';
00214 ix = (ix<<2) + bv;
00215 }
00216 c = codonTable[ix].protCode;
00217 return c;
00218 }
00219
00220 boolean isStopCodon(DNA *dna)
00221
00222 {
00223 return lookupCodon(dna) == 0;
00224 }
00225
00226 boolean isKozak(char *dna, int dnaSize, int pos)
00227
00228 {
00229 if (lookupCodon(dna+pos) != 'M')
00230 {
00231 return FALSE;
00232 }
00233 if (pos + 3 < dnaSize)
00234 {
00235 if (ntVal[(int)dna[pos+3]] == G_BASE_VAL)
00236 return TRUE;
00237 }
00238 if (pos >= 3)
00239 {
00240 int c = ntVal[(int)dna[pos-3]];
00241 if (c == A_BASE_VAL || c == G_BASE_VAL)
00242 return TRUE;
00243 }
00244 return FALSE;
00245 }
00246
00247
00248 boolean isReallyStopCodon(char *dna, boolean selenocysteine)
00249
00250
00251 {
00252 if (selenocysteine)
00253 {
00254
00255
00256 return lookupMitoCodon(dna) == 0;
00257 }
00258 else
00259 {
00260 return lookupCodon(dna) == 0;
00261 }
00262 }
00263
00264
00265
00266
00267
00268 AA lookupMitoCodon(DNA *dna)
00269 {
00270 int ix;
00271 int i;
00272 char c;
00273
00274 if (!inittedNtVal)
00275 initNtVal();
00276 ix = 0;
00277 for (i=0; i<3; ++i)
00278 {
00279 int bv = ntVal[(int)dna[i]];
00280 if (bv<0)
00281 return 'X';
00282 ix = (ix<<2) + bv;
00283 }
00284 c = codonTable[ix].mitoCode;
00285 c = toupper(c);
00286 return c;
00287 }
00288
00289 Codon codonVal(DNA *start)
00290
00291
00292 {
00293 int v1,v2,v3;
00294
00295 if ((v1 = ntVal[(int)start[0]]) < 0)
00296 return -1;
00297 if ((v2 = ntVal[(int)start[1]]) < 0)
00298 return -1;
00299 if ((v3 = ntVal[(int)start[2]]) < 0)
00300 return -1;
00301 return ((v1<<4) + (v2<<2) + v3);
00302 }
00303
00304 DNA *valToCodon(int val)
00305
00306 {
00307 assert(val >= 0 && val < 64);
00308 return codonTable[val].codon;
00309 }
00310
00311 void dnaTranslateSome(DNA *dna, char *out, int outSize)
00312
00313
00314 {
00315 int i;
00316 int dnaSize;
00317 int protSize = 0;
00318
00319 outSize -= 1;
00320 dnaSize = strlen(dna);
00321 for (i=0; i<dnaSize-2; i+=3)
00322 {
00323 if (protSize >= outSize)
00324 break;
00325 if ((out[protSize++] = lookupCodon(dna+i)) == 0)
00326 break;
00327 }
00328 out[protSize] = 0;
00329 }
00330
00331
00332
00333 char ntChars[256];
00334
00335 static void initNtChars()
00336 {
00337 static boolean initted = FALSE;
00338
00339 if (!initted)
00340 {
00341 zeroBytes(ntChars, sizeof(ntChars));
00342 ntChars['a'] = ntChars['A'] = 'a';
00343 ntChars['c'] = ntChars['C'] = 'c';
00344 ntChars['g'] = ntChars['G'] = 'g';
00345 ntChars['t'] = ntChars['T'] = 't';
00346 ntChars['n'] = ntChars['N'] = 'n';
00347 ntChars['u'] = ntChars['U'] = 'u';
00348 ntChars['-'] = 'n';
00349 initted = TRUE;
00350 }
00351 }
00352
00353 char ntMixedCaseChars[256];
00354
00355 static void initNtMixedCaseChars()
00356 {
00357 static boolean initted = FALSE;
00358
00359 if (!initted)
00360 {
00361 zeroBytes(ntMixedCaseChars, sizeof(ntMixedCaseChars));
00362 ntMixedCaseChars['a'] = 'a';
00363 ntMixedCaseChars['A'] = 'A';
00364 ntMixedCaseChars['c'] = 'c';
00365 ntMixedCaseChars['C'] = 'C';
00366 ntMixedCaseChars['g'] = 'g';
00367 ntMixedCaseChars['G'] = 'G';
00368 ntMixedCaseChars['t'] = 't';
00369 ntMixedCaseChars['T'] = 'T';
00370 ntMixedCaseChars['n'] = 'n';
00371 ntMixedCaseChars['N'] = 'N';
00372 ntMixedCaseChars['u'] = 'u';
00373 ntMixedCaseChars['U'] = 'U';
00374 ntMixedCaseChars['-'] = 'n';
00375 initted = TRUE;
00376 }
00377 }
00378
00379
00380 DNA ntCompTable[256];
00381 static boolean inittedCompTable = FALSE;
00382
00383 static void initNtCompTable()
00384 {
00385 zeroBytes(ntCompTable, sizeof(ntCompTable));
00386 ntCompTable[' '] = ' ';
00387 ntCompTable['-'] = '-';
00388 ntCompTable['='] = '=';
00389 ntCompTable['a'] = 't';
00390 ntCompTable['c'] = 'g';
00391 ntCompTable['g'] = 'c';
00392 ntCompTable['t'] = 'a';
00393 ntCompTable['u'] = 'a';
00394 ntCompTable['n'] = 'n';
00395 ntCompTable['-'] = '-';
00396 ntCompTable['.'] = '.';
00397 ntCompTable['A'] = 'T';
00398 ntCompTable['C'] = 'G';
00399 ntCompTable['G'] = 'C';
00400 ntCompTable['T'] = 'A';
00401 ntCompTable['U'] = 'A';
00402 ntCompTable['N'] = 'N';
00403 ntCompTable['R'] = 'Y';
00404 ntCompTable['Y'] = 'R';
00405 ntCompTable['M'] = 'K';
00406 ntCompTable['K'] = 'M';
00407 ntCompTable['S'] = 'S';
00408 ntCompTable['W'] = 'W';
00409 ntCompTable['V'] = 'B';
00410 ntCompTable['H'] = 'D';
00411 ntCompTable['D'] = 'H';
00412 ntCompTable['B'] = 'V';
00413 ntCompTable['X'] = 'N';
00414 ntCompTable['r'] = 'y';
00415 ntCompTable['y'] = 'r';
00416 ntCompTable['s'] = 's';
00417 ntCompTable['w'] = 'w';
00418 ntCompTable['m'] = 'k';
00419 ntCompTable['k'] = 'm';
00420 ntCompTable['v'] = 'b';
00421 ntCompTable['h'] = 'd';
00422 ntCompTable['d'] = 'h';
00423 ntCompTable['b'] = 'v';
00424 ntCompTable['x'] = 'n';
00425 ntCompTable['('] = ')';
00426 ntCompTable[')'] = '(';
00427 inittedCompTable = TRUE;
00428 }
00429
00430
00431 void complement(DNA *dna, long length)
00432 {
00433 int i;
00434
00435 if (!inittedCompTable) initNtCompTable();
00436 for (i=0; i<length; ++i)
00437 {
00438 *dna = ntCompTable[(int)*dna];
00439 ++dna;
00440 }
00441 }
00442
00443
00444
00445 void reverseComplement(DNA *dna, long length)
00446 {
00447 reverseBytes(dna, length);
00448 complement(dna, length);
00449 }
00450
00451
00452
00453 long reverseOffset(long offset, long arraySize)
00454 {
00455 return arraySize-1 - offset;
00456 }
00457
00458
00459
00460 void reverseIntRange(int *pStart, int *pEnd, int size)
00461 {
00462 int temp;
00463 temp = *pStart;
00464 *pStart = size - *pEnd;
00465 *pEnd = size - temp;
00466 }
00467
00468
00469
00470 void reverseUnsignedRange(unsigned *pStart, unsigned *pEnd, int size)
00471 {
00472 unsigned temp;
00473 temp = *pStart;
00474 *pStart = size - *pEnd;
00475 *pEnd = size - temp;
00476 }
00477
00478
00479
00480 void toRna(DNA *dna)
00481 {
00482 DNA c;
00483 for (;;)
00484 {
00485 c = *dna;
00486 if (c == 't')
00487 *dna = 'u';
00488 else if (c == 'T')
00489 *dna = 'U';
00490 else if (c == 0)
00491 break;
00492 ++dna;
00493 }
00494 }
00495
00496 char *skipIgnoringDash(char *a, int size, bool skipTrailingDash)
00497
00498
00499 {
00500 while (size > 0)
00501 {
00502 if (*a++ != '-')
00503 --size;
00504 }
00505 if (skipTrailingDash)
00506 while (*a == '-')
00507 ++a;
00508 return a;
00509 }
00510
00511 int countNonDash(char *a, int size)
00512
00513 {
00514 int count = 0;
00515 int i;
00516 for (i=0; i<size; ++i)
00517 if (a[i] != '-')
00518 ++count;
00519 return count;
00520 }
00521
00522 int nextPowerOfFour(long x)
00523
00524
00525
00526
00527 {
00528 int count = 1;
00529 while (x > 4)
00530 {
00531 count += 1;
00532 x >>= 2;
00533 }
00534 return count;
00535 }
00536
00537 long dnaOrAaFilteredSize(char *raw, char filter[256])
00538
00539 {
00540 char c;
00541 long count = 0;
00542 dnaUtilOpen();
00543 while ((c = *raw++) != 0)
00544 {
00545 if (filter[(int)c]) ++count;
00546 }
00547 return count;
00548 }
00549
00550 void dnaOrAaFilter(char *in, char *out, char filter[256])
00551
00552 {
00553 char c;
00554 dnaUtilOpen();
00555 while ((c = *in++) != 0)
00556 {
00557 if ((c = filter[(int)c]) != 0) *out++ = c;
00558 }
00559 *out++ = 0;
00560 }
00561
00562 long dnaFilteredSize(char *rawDna)
00563
00564 {
00565 return dnaOrAaFilteredSize(rawDna, ntChars);
00566 }
00567
00568 void dnaFilter(char *in, DNA *out)
00569
00570 {
00571 dnaOrAaFilter(in, out, ntChars);
00572 }
00573
00574 void dnaFilterToN(char *in, DNA *out)
00575
00576 {
00577 DNA c;
00578 initNtChars();
00579 while ((c = *in++) != 0)
00580 {
00581 if ((c = ntChars[(int)c]) != 0) *out++ = c;
00582 else *out++ = 'n';
00583 }
00584 *out++ = 0;
00585 }
00586
00587 void dnaMixedCaseFilter(char *in, DNA *out)
00588
00589 {
00590 dnaOrAaFilter(in, out, ntMixedCaseChars);
00591 }
00592
00593 long aaFilteredSize(char *raw)
00594
00595 {
00596 return dnaOrAaFilteredSize(raw, aaChars);
00597 }
00598
00599 void aaFilter(char *in, DNA *out)
00600
00601 {
00602 dnaOrAaFilter(in, out, aaChars);
00603 }
00604
00605 void upperToN(char *s, int size)
00606
00607 {
00608 char c;
00609 int i;
00610 for (i=0; i<size; ++i)
00611 {
00612 c = s[i];
00613 if (isupper(c))
00614 s[i] = 'n';
00615 }
00616 }
00617
00618 void lowerToN(char *s, int size)
00619
00620 {
00621 char c;
00622 int i;
00623 for (i=0; i<size; ++i)
00624 {
00625 c = s[i];
00626 if (islower(c))
00627 s[i] = 'N';
00628 }
00629 }
00630
00631
00632 void dnaBaseHistogram(DNA *dna, int dnaSize, int histogram[4])
00633
00634
00635 {
00636 int val;
00637 zeroBytes(histogram, 4*sizeof(int));
00638 while (--dnaSize >= 0)
00639 {
00640 if ((val = ntVal[(int)*dna++]) >= 0)
00641 ++histogram[val];
00642 }
00643 }
00644
00645 bits32 packDna16(DNA *in)
00646
00647 {
00648 bits32 out = 0;
00649 int count = 16;
00650 int bVal;
00651 while (--count >= 0)
00652 {
00653 bVal = ntValNoN[(int)*in++];
00654 out <<= 2;
00655 out += bVal;
00656 }
00657 return out;
00658 }
00659
00660 bits16 packDna8(DNA *in)
00661
00662 {
00663 bits16 out = 0;
00664 int count = 8;
00665 int bVal;
00666 while (--count >= 0)
00667 {
00668 bVal = ntValNoN[(int)*in++];
00669 out <<= 2;
00670 out += bVal;
00671 }
00672 return out;
00673 }
00674
00675 UBYTE packDna4(DNA *in)
00676
00677 {
00678 UBYTE out = 0;
00679 int count = 4;
00680 int bVal;
00681 while (--count >= 0)
00682 {
00683 bVal = ntValNoN[(int)*in++];
00684 out <<= 2;
00685 out += bVal;
00686 }
00687 return out;
00688 }
00689
00690 void unpackDna(bits32 *tiles, int tileCount, DNA *out)
00691
00692 {
00693 int i, j;
00694 bits32 tile;
00695
00696 for (i=0; i<tileCount; ++i)
00697 {
00698 tile = tiles[i];
00699 for (j=15; j>=0; --j)
00700 {
00701 out[j] = valToNt[tile & 0x3];
00702 tile >>= 2;
00703 }
00704 out += 16;
00705 }
00706 }
00707
00708 void unpackDna4(UBYTE *tiles, int byteCount, DNA *out)
00709
00710 {
00711 int i, j;
00712 UBYTE tile;
00713
00714 for (i=0; i<byteCount; ++i)
00715 {
00716 tile = tiles[i];
00717 for (j=3; j>=0; --j)
00718 {
00719 out[j] = valToNt[tile & 0x3];
00720 tile >>= 2;
00721 }
00722 out += 4;
00723 }
00724 }
00725
00726
00727
00728
00729 static void checkSizeTypes()
00730
00731 {
00732 assert(sizeof(UBYTE) == 1);
00733 assert(sizeof(WORD) == 2);
00734 assert(sizeof(bits32) == 4);
00735 assert(sizeof(bits16) == 2);
00736 }
00737
00738 int intronOrientationMinSize(DNA *iStart, DNA *iEnd, int minIntronSize)
00739
00740
00741
00742 {
00743 if (iEnd - iStart < minIntronSize)
00744 return 0;
00745 if (iStart[0] == 'g' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'g')
00746 {
00747 return 1;
00748 }
00749 else if (iStart[0] == 'c' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'c')
00750 {
00751 return -1;
00752 }
00753 else
00754 return 0;
00755 }
00756
00757 int intronOrientation(DNA *iStart, DNA *iEnd)
00758
00759
00760
00761 {
00762 return intronOrientationMinSize(iStart, iEnd, 32);
00763 }
00764
00765 int dnaScore2(DNA a, DNA b)
00766
00767 {
00768 if (a == 'n' || b == 'n') return 0;
00769 if (a == b) return 1;
00770 else return -1;
00771 }
00772
00773 int dnaOrAaScoreMatch(char *a, char *b, int size, int matchScore, int mismatchScore,
00774 char ignore)
00775
00776 {
00777 int i;
00778 int score = 0;
00779 for (i=0; i<size; ++i)
00780 {
00781 char aa = a[i];
00782 char bb = b[i];
00783 if (aa == ignore || bb == ignore)
00784 continue;
00785 if (aa == bb)
00786 score += matchScore;
00787 else
00788 score += mismatchScore;
00789 }
00790 return score;
00791 }
00792
00793 int dnaScoreMatch(DNA *a, DNA *b, int size)
00794
00795
00796
00797 {
00798 return dnaOrAaScoreMatch(a, b, size, 1, -1, 'n');
00799 }
00800
00801 int aaScore2(AA a, AA b)
00802
00803 {
00804 if (a == 'X' || b == 'X') return 0;
00805 if (a == b) return 2;
00806 else return -1;
00807 }
00808
00809 int aaScoreMatch(AA *a, AA *b, int size)
00810
00811 {
00812 return dnaOrAaScoreMatch(a, b, size, 2, -1, 'X');
00813 }
00814
00815 void writeSeqWithBreaks(FILE *f, char *letters, int letterCount, int maxPerLine)
00816
00817 {
00818 int lettersLeft = letterCount;
00819 int lineSize;
00820 while (lettersLeft > 0)
00821 {
00822 lineSize = lettersLeft;
00823 if (lineSize > maxPerLine)
00824 lineSize = maxPerLine;
00825 mustWrite(f, letters, lineSize);
00826 fputc('\n', f);
00827 letters += lineSize;
00828 lettersLeft -= lineSize;
00829 }
00830 }
00831
00832 static int findTailPolyAMaybeMask(DNA *dna, int size, boolean doMask,
00833 boolean loose)
00834
00835
00836
00837 {
00838 int i;
00839 int score = 10;
00840 int bestScore = 10;
00841 int bestPos = -1;
00842 int trimSize = 0;
00843
00844 for (i=size-1; i>=0; --i)
00845 {
00846 DNA b = dna[i];
00847 if (b == 'n' || b == 'N')
00848 continue;
00849 if (score > 20) score = 20;
00850 if (b == 'a' || b == 'A')
00851 {
00852 score += 1;
00853 if (score >= bestScore)
00854 {
00855 bestScore = score;
00856 bestPos = i;
00857 }
00858 else if (loose && score >= (bestScore - 8))
00859 {
00860
00861 bestPos = i;
00862 }
00863 }
00864 else
00865 {
00866 score -= 10;
00867 }
00868 if (score < 0)
00869 {
00870 break;
00871 }
00872 }
00873 if (bestPos >= 0)
00874 {
00875 trimSize = size - bestPos - 2;
00876 if (trimSize > 0)
00877 {
00878 if (doMask)
00879 for (i=size - trimSize; i<size; ++i)
00880 dna[i] = 'n';
00881 }
00882 else
00883 trimSize = 0;
00884 }
00885 return trimSize;
00886 }
00887
00888 int tailPolyASizeLoose(DNA *dna, int size)
00889
00890
00891
00892 {
00893 return findTailPolyAMaybeMask(dna, size, FALSE, TRUE);
00894 }
00895
00896 int maskTailPolyA(DNA *dna, int size)
00897
00898
00899 {
00900 return findTailPolyAMaybeMask(dna, size, TRUE, FALSE);
00901 }
00902
00903 static int findHeadPolyTMaybeMask(DNA *dna, int size, boolean doMask,
00904 boolean loose)
00905
00906
00907
00908 {
00909 int i;
00910 int score = 10;
00911 int bestScore = 10;
00912 int bestPos = -1;
00913 int pastPoly = 0;
00914 int trimSize = 0;
00915
00916 for (i=0; i<size; ++i)
00917 {
00918 DNA b = dna[i];
00919 if (b == 'n' || b == 'N')
00920 continue;
00921 if (score > 20) score = 20;
00922 if (b == 't' || b == 'T')
00923 {
00924 score += 1;
00925 if (score >= bestScore)
00926 {
00927 bestScore = score;
00928 bestPos = i;
00929 }
00930 else if (loose && score >= (bestScore - 8))
00931 {
00932
00933 bestPos = i;
00934 }
00935 }
00936 else
00937 {
00938 score -= 10;
00939 }
00940 if (score < 0)
00941 {
00942 pastPoly = i;
00943 break;
00944 }
00945 }
00946 if (bestPos >= 0)
00947 {
00948 trimSize = bestPos+1 - 2;
00949 if (trimSize > 0)
00950 {
00951 if (doMask)
00952 memset(dna, 'n', trimSize);
00953 }
00954 else
00955 trimSize = 0;
00956 }
00957 return trimSize;
00958 }
00959
00960 int headPolyTSizeLoose(DNA *dna, int size)
00961
00962
00963
00964
00965 {
00966 return findHeadPolyTMaybeMask(dna, size, FALSE, TRUE);
00967 }
00968
00969 int maskHeadPolyT(DNA *dna, int size)
00970
00971
00972 {
00973 return findHeadPolyTMaybeMask(dna, size, TRUE, FALSE);
00974 }
00975
00976 boolean isDna(char *poly, int size)
00977
00978 {
00979 int i;
00980 int dnaCount = 0;
00981
00982 dnaUtilOpen();
00983 for (i=0; i<size; ++i)
00984 {
00985 if (ntChars[(int)poly[i]])
00986 dnaCount += 1;
00987 }
00988 return (dnaCount >= round(0.9 * size));
00989 }
00990
00991 boolean isAllDna(char *poly, int size)
00992
00993 {
00994 int i;
00995
00996 if (size <= 1)
00997 return FALSE;
00998 dnaUtilOpen();
00999 for (i=0; i<size-1; ++i)
01000 {
01001 if (ntChars[(int)poly[i]] == 0)
01002 return FALSE;
01003 }
01004 return TRUE;
01005 }
01006
01007
01008
01009
01010
01011 int aaVal[256];
01012 AA valToAa[20];
01013
01014 AA aaChars[256];
01015
01016 struct aminoAcidTable
01017
01018 {
01019 int ix;
01020 char letter;
01021 char abbreviation[3];
01022 char *name;
01023 };
01024
01025 struct aminoAcidTable aminoAcidTable[] =
01026 {
01027 {0, 'A', "ala", "alanine"},
01028 {1, 'C', "cys", "cysteine"},
01029 {2, 'D', "asp", "aspartic acid"},
01030 {3, 'E', "glu", "glutamic acid"},
01031 {4, 'F', "phe", "phenylalanine"},
01032 {5, 'G', "gly", "glycine"},
01033 {6, 'H', "his", "histidine"},
01034 {7, 'I', "ile", "isoleucine"},
01035 {8, 'K', "lys", "lysine"},
01036 {9, 'L', "leu", "leucine"},
01037 {10, 'M', "met", "methionine"},
01038 {11, 'N', "asn", "asparagine"},
01039 {12, 'P', "pro", "proline"},
01040 {13, 'Q', "gln", "glutamine"},
01041 {14, 'R', "arg", "arginine"},
01042 {15, 'S', "ser", "serine"},
01043 {16, 'T', "thr", "threonine"},
01044 {17, 'V', "val", "valine"},
01045 {18, 'W', "try", "tryptophan"},
01046 {19, 'Y', "tyr", "tyrosine"},
01047 };
01048
01049 static void initAaVal()
01050
01051 {
01052 int i;
01053 char c, lowc;
01054
01055 for (i=0; i<ArraySize(aaVal); ++i)
01056 aaVal[i] = -1;
01057 for (i=0; i<ArraySize(aminoAcidTable); ++i)
01058 {
01059 c = aminoAcidTable[i].letter;
01060 lowc = tolower(c);
01061 aaVal[(int)c] = aaVal[(int)lowc] = i;
01062 aaChars[(int)c] = aaChars[(int)lowc] = c;
01063 valToAa[i] = c;
01064 }
01065 aaChars['x'] = aaChars['X'] = 'X';
01066 }
01067
01068 void dnaUtilOpen()
01069
01070 {
01071 static boolean opened = FALSE;
01072 if (!opened)
01073 {
01074 checkSizeTypes();
01075 initNtVal();
01076 initAaVal();
01077 initNtChars();
01078 initNtMixedCaseChars();
01079 initNtCompTable();
01080 opened = TRUE;
01081 }
01082 }
01083