lib/dnautil.c

Go to the documentation of this file.
00001 /* Some stuff that you'll likely need in any program that works with
00002  * DNA.  Includes stuff for amino acids as well. 
00003  *
00004  * Assumes that DNA is stored as a character.
00005  * The DNA it generates will include the bases 
00006  * as lowercase tcag.  It will generally accept
00007  * uppercase as well, and also 'n' or 'N' or '-'
00008  * for unknown bases. 
00009  *
00010  * Amino acids are stored as single character upper case. 
00011  *
00012  * This file is copyright 2002 Jim Kent, but license is hereby
00013  * granted for all use - public, private or commercial. */
00014 
00015 #include "common.h"
00016 #include "dnautil.h"
00017 
00018 static char const rcsid[] = "$Id: dnautil.c,v 1.49 2007/03/14 04:54:55 kent Exp $";
00019 
00020 struct codonTable
00021 /* The dread codon table. */
00022     {
00023     DNA *codon;         /* Lower case. */
00024     AA protCode;        /* Upper case. The "Standard" code */
00025     AA mitoCode;        /* Upper case. Vertebrate Mitochondrial translations */
00026     };
00027 
00028 struct codonTable codonTable[] = 
00029 /* The master codon/protein table. */
00030 {
00031     {"ttt", 'F', 'F',},
00032     {"ttc", 'F', 'F',},
00033     {"tta", 'L', 'L',},
00034     {"ttg", 'L', 'L',},
00035 
00036     {"tct", 'S', 'S',},
00037     {"tcc", 'S', 'S',},
00038     {"tca", 'S', 'S',},
00039     {"tcg", 'S', 'S',},
00040 
00041     {"tat", 'Y', 'Y',},
00042     {"tac", 'Y', 'Y',},
00043     {"taa", 0, 0,},
00044     {"tag", 0, 0,},
00045 
00046     {"tgt", 'C', 'C',},
00047     {"tgc", 'C', 'C',},
00048     {"tga", 0, 'W',},
00049     {"tgg", 'W', 'W',},
00050 
00051 
00052     {"ctt", 'L', 'L',},
00053     {"ctc", 'L', 'L',},
00054     {"cta", 'L', 'L',},
00055     {"ctg", 'L', 'L',},
00056 
00057     {"cct", 'P', 'P',},
00058     {"ccc", 'P', 'P',},
00059     {"cca", 'P', 'P',},
00060     {"ccg", 'P', 'P',},
00061 
00062     {"cat", 'H', 'H',},
00063     {"cac", 'H', 'H',},
00064     {"caa", 'Q', 'Q',},
00065     {"cag", 'Q', 'Q',},
00066 
00067     {"cgt", 'R', 'R',},
00068     {"cgc", 'R', 'R',},
00069     {"cga", 'R', 'R',},
00070     {"cgg", 'R', 'R',},
00071 
00072 
00073     {"att", 'I', 'I',},
00074     {"atc", 'I', 'I',},
00075     {"ata", 'I', 'M',},
00076     {"atg", 'M', 'M',},
00077 
00078     {"act", 'T', 'T',},
00079     {"acc", 'T', 'T',},
00080     {"aca", 'T', 'T',},
00081     {"acg", 'T', 'T',},
00082 
00083     {"aat", 'N', 'N',},
00084     {"aac", 'N', 'N',},
00085     {"aaa", 'K', 'K',},
00086     {"aag", 'K', 'K',},
00087 
00088     {"agt", 'S', 'S',},
00089     {"agc", 'S', 'S',},
00090     {"aga", 'R', 0,},
00091     {"agg", 'R', 0,},
00092 
00093 
00094     {"gtt", 'V', 'V',},
00095     {"gtc", 'V', 'V',},
00096     {"gta", 'V', 'V',},
00097     {"gtg", 'V', 'V',},
00098 
00099     {"gct", 'A', 'A',},
00100     {"gcc", 'A', 'A',},
00101     {"gca", 'A', 'A',},
00102     {"gcg", 'A', 'A',},
00103 
00104     {"gat", 'D', 'D',},
00105     {"gac", 'D', 'D',},
00106     {"gaa", 'E', 'E',},
00107     {"gag", 'E', 'E',},
00108 
00109     {"ggt", 'G', 'G',},
00110     {"ggc", 'G', 'G',},
00111     {"gga", 'G', 'G',},
00112     {"ggg", 'G', 'G',},
00113 };
00114 
00115 /* A table that gives values 0 for t
00116                              1 for c
00117                              2 for a
00118                              3 for g
00119  * (which is order aa's are in biochemistry codon tables)
00120  * and gives -1 for all others. */
00121 int ntVal[256];
00122 int ntValLower[256];    /* NT values only for lower case. */
00123 int ntValUpper[256];    /* NT values only for upper case. */
00124 int ntVal5[256];
00125 int ntValNoN[256]; /* Like ntVal, but with T_BASE_VAL in place of -1 for nonexistent ones. */
00126 DNA valToNt[(N_BASE_VAL|MASKED_BASE_BIT)+1];
00127 
00128 /* convert tables for bit-4 indicating masked */
00129 int ntValMasked[256];
00130 DNA valToNtMasked[256];
00131 
00132 static boolean inittedNtVal = FALSE;
00133 
00134 static void initNtVal()
00135 {
00136 if (!inittedNtVal)
00137     {
00138     int i;
00139     for (i=0; i<ArraySize(ntVal); i++)
00140         {
00141         ntValUpper[i] = ntValLower[i] = ntVal[i] = -1;
00142         ntValNoN[i] = T_BASE_VAL;
00143         if (isspace(i) || isdigit(i))
00144             ntVal5[i] = ntValMasked[i] = -1;
00145         else
00146             {
00147             ntVal5[i] = N_BASE_VAL;
00148             ntValMasked[i] = (islower(i) ? (N_BASE_VAL|MASKED_BASE_BIT) : N_BASE_VAL);
00149             }
00150         }
00151     ntVal5['t'] = ntVal5['T'] = ntValNoN['t'] = ntValNoN['T'] = ntVal['t'] = ntVal['T'] = 
00152         ntValLower['t'] = ntValUpper['T'] = T_BASE_VAL;
00153     ntVal5['u'] = ntVal5['U'] = ntValNoN['u'] = ntValNoN['U'] = ntVal['u'] = ntVal['U'] = 
00154         ntValLower['u'] = ntValUpper['U'] = U_BASE_VAL;
00155     ntVal5['c'] = ntVal5['C'] = ntValNoN['c'] = ntValNoN['C'] = ntVal['c'] = ntVal['C'] = 
00156         ntValLower['c'] = ntValUpper['C'] = C_BASE_VAL;
00157     ntVal5['a'] = ntVal5['A'] = ntValNoN['a'] = ntValNoN['A'] = ntVal['a'] = ntVal['A'] = 
00158         ntValLower['a'] = ntValUpper['A'] = A_BASE_VAL;
00159     ntVal5['g'] = ntVal5['G'] = ntValNoN['g'] = ntValNoN['G'] = ntVal['g'] = ntVal['G'] = 
00160         ntValLower['g'] = ntValUpper['G'] = G_BASE_VAL;
00161 
00162     valToNt[T_BASE_VAL] = valToNt[T_BASE_VAL|MASKED_BASE_BIT] = 't';
00163     valToNt[C_BASE_VAL] = valToNt[C_BASE_VAL|MASKED_BASE_BIT] = 'c';
00164     valToNt[A_BASE_VAL] = valToNt[A_BASE_VAL|MASKED_BASE_BIT] = 'a';
00165     valToNt[G_BASE_VAL] = valToNt[G_BASE_VAL|MASKED_BASE_BIT] = 'g';
00166     valToNt[N_BASE_VAL] = valToNt[N_BASE_VAL|MASKED_BASE_BIT] = 'n';
00167 
00168     /* masked values */
00169     ntValMasked['T'] = T_BASE_VAL;
00170     ntValMasked['U'] = U_BASE_VAL;
00171     ntValMasked['C'] = C_BASE_VAL;
00172     ntValMasked['A'] = A_BASE_VAL;
00173     ntValMasked['G'] = G_BASE_VAL;
00174 
00175     ntValMasked['t'] = T_BASE_VAL|MASKED_BASE_BIT;
00176     ntValMasked['u'] = U_BASE_VAL|MASKED_BASE_BIT;
00177     ntValMasked['c'] = C_BASE_VAL|MASKED_BASE_BIT;
00178     ntValMasked['a'] = A_BASE_VAL|MASKED_BASE_BIT;
00179     ntValMasked['g'] = G_BASE_VAL|MASKED_BASE_BIT;
00180 
00181     valToNtMasked[T_BASE_VAL] = 'T';
00182     valToNtMasked[C_BASE_VAL] = 'C';
00183     valToNtMasked[A_BASE_VAL] = 'A';
00184     valToNtMasked[G_BASE_VAL] = 'G';
00185     valToNtMasked[N_BASE_VAL] = 'N';
00186 
00187     valToNtMasked[T_BASE_VAL|MASKED_BASE_BIT] = 't';
00188     valToNtMasked[C_BASE_VAL|MASKED_BASE_BIT] = 'c';
00189     valToNtMasked[A_BASE_VAL|MASKED_BASE_BIT] = 'a';
00190     valToNtMasked[G_BASE_VAL|MASKED_BASE_BIT] = 'g';
00191     valToNtMasked[N_BASE_VAL|MASKED_BASE_BIT] = 'n';
00192 
00193     inittedNtVal = TRUE;
00194     }
00195 }
00196 
00197 /* Returns one letter code for protein, 
00198  * 0 for stop codon or X for bad input,
00199  * The "Standard" Code */
00200 AA lookupCodon(DNA *dna)
00201 {
00202 int ix;
00203 int i;
00204 char c;
00205 
00206 if (!inittedNtVal)
00207     initNtVal();
00208 ix = 0;
00209 for (i=0; i<3; ++i)
00210     {
00211     int bv = ntVal[(int)dna[i]];
00212     if (bv<0)
00213         return 'X';
00214     ix = (ix<<2) + bv;
00215     }
00216 c = codonTable[ix].protCode;
00217 return c;
00218 }
00219 
00220 boolean isStopCodon(DNA *dna)
00221 /* Return TRUE if it's a stop codon. */
00222 {
00223 return lookupCodon(dna) == 0;
00224 }
00225 
00226 boolean isKozak(char *dna, int dnaSize, int pos)
00227 /* Return TRUE if it's a Kozak compatible start. */
00228 {
00229 if (lookupCodon(dna+pos) != 'M')
00230    {
00231    return FALSE;
00232    }
00233 if (pos + 3 < dnaSize)
00234     {
00235     if (ntVal[(int)dna[pos+3]] == G_BASE_VAL)
00236         return TRUE;
00237     }
00238 if (pos >= 3)
00239     {
00240     int c = ntVal[(int)dna[pos-3]];
00241     if (c == A_BASE_VAL || c == G_BASE_VAL)
00242         return TRUE;
00243     }
00244 return FALSE;
00245 }
00246 
00247 
00248 boolean isReallyStopCodon(char *dna, boolean selenocysteine)
00249 /* Return TRUE if it's really a stop codon, even considering
00250  * possibilility of selenocysteine. */
00251 {
00252 if (selenocysteine)
00253     {
00254     /* Luckily the mitochondria *also* replaces TGA with 
00255      * something else, even though it isn't selenocysteine */
00256     return lookupMitoCodon(dna) == 0;
00257     }
00258 else
00259     {
00260     return lookupCodon(dna) == 0;
00261     }
00262 }
00263 
00264 
00265 /* Returns one letter code for protein, 
00266  * 0 for stop codon or X for bad input,
00267  * Vertebrate Mitochondrial Code */
00268 AA lookupMitoCodon(DNA *dna)
00269 {
00270 int ix;
00271 int i;
00272 char c;
00273 
00274 if (!inittedNtVal)
00275     initNtVal();
00276 ix = 0;
00277 for (i=0; i<3; ++i)
00278     {
00279     int bv = ntVal[(int)dna[i]];
00280     if (bv<0)
00281         return 'X';
00282     ix = (ix<<2) + bv;
00283     }
00284 c = codonTable[ix].mitoCode;
00285 c = toupper(c);
00286 return c;
00287 }
00288 
00289 Codon codonVal(DNA *start)
00290 /* Return value from 0-63 of codon starting at start. 
00291  * Returns -1 if not a codon. */
00292 {
00293 int v1,v2,v3;
00294 
00295 if ((v1 = ntVal[(int)start[0]]) < 0)
00296     return -1;
00297 if ((v2 = ntVal[(int)start[1]]) < 0)
00298     return -1;
00299 if ((v3 = ntVal[(int)start[2]]) < 0)
00300     return -1;
00301 return ((v1<<4) + (v2<<2) + v3);
00302 }
00303 
00304 DNA *valToCodon(int val)
00305 /* Return  codon corresponding to val (0-63) */
00306 {
00307 assert(val >= 0 && val < 64);
00308 return codonTable[val].codon;
00309 }
00310 
00311 void dnaTranslateSome(DNA *dna, char *out, int outSize)
00312 /* Translate DNA upto a stop codon or until outSize-1 amino acids, 
00313  * whichever comes first. Output will be zero terminated. */
00314 {
00315 int i;
00316 int dnaSize;
00317 int protSize = 0;
00318 
00319 outSize -= 1;  /* Room for terminal zero */
00320 dnaSize = strlen(dna);
00321 for (i=0; i<dnaSize-2; i+=3)
00322     {
00323     if (protSize >= outSize)
00324         break;
00325     if ((out[protSize++] = lookupCodon(dna+i)) == 0)
00326         break;
00327     }
00328 out[protSize] = 0;
00329 }
00330 
00331 /* A little array to help us decide if a character is a 
00332  * nucleotide, and if so convert it to lower case. */
00333 char ntChars[256];
00334 
00335 static void initNtChars()
00336 {
00337 static boolean initted = FALSE;
00338 
00339 if (!initted)
00340     {
00341     zeroBytes(ntChars, sizeof(ntChars));
00342     ntChars['a'] = ntChars['A'] = 'a';
00343     ntChars['c'] = ntChars['C'] = 'c';
00344     ntChars['g'] = ntChars['G'] = 'g';
00345     ntChars['t'] = ntChars['T'] = 't';
00346     ntChars['n'] = ntChars['N'] = 'n';
00347     ntChars['u'] = ntChars['U'] = 'u';
00348     ntChars['-'] = 'n';
00349     initted = TRUE;
00350     }
00351 }
00352 
00353 char ntMixedCaseChars[256];
00354 
00355 static void initNtMixedCaseChars()
00356 {
00357 static boolean initted = FALSE;
00358 
00359 if (!initted)
00360     {
00361     zeroBytes(ntMixedCaseChars, sizeof(ntMixedCaseChars));
00362     ntMixedCaseChars['a'] = 'a';
00363     ntMixedCaseChars['A'] = 'A';
00364     ntMixedCaseChars['c'] = 'c';
00365     ntMixedCaseChars['C'] = 'C';
00366     ntMixedCaseChars['g'] = 'g';
00367     ntMixedCaseChars['G'] = 'G';
00368     ntMixedCaseChars['t'] = 't';
00369     ntMixedCaseChars['T'] = 'T';
00370     ntMixedCaseChars['n'] = 'n';
00371     ntMixedCaseChars['N'] = 'N';
00372     ntMixedCaseChars['u'] = 'u';
00373     ntMixedCaseChars['U'] = 'U';
00374     ntMixedCaseChars['-'] = 'n';
00375     initted = TRUE;
00376     }
00377 }
00378 
00379 /* Another array to help us do complement of DNA */
00380 DNA ntCompTable[256];
00381 static boolean inittedCompTable = FALSE;
00382 
00383 static void initNtCompTable()
00384 {
00385 zeroBytes(ntCompTable, sizeof(ntCompTable));
00386 ntCompTable[' '] = ' ';
00387 ntCompTable['-'] = '-';
00388 ntCompTable['='] = '=';
00389 ntCompTable['a'] = 't';
00390 ntCompTable['c'] = 'g';
00391 ntCompTable['g'] = 'c';
00392 ntCompTable['t'] = 'a';
00393 ntCompTable['u'] = 'a';
00394 ntCompTable['n'] = 'n';
00395 ntCompTable['-'] = '-';
00396 ntCompTable['.'] = '.';
00397 ntCompTable['A'] = 'T';
00398 ntCompTable['C'] = 'G';
00399 ntCompTable['G'] = 'C';
00400 ntCompTable['T'] = 'A';
00401 ntCompTable['U'] = 'A';
00402 ntCompTable['N'] = 'N';
00403 ntCompTable['R'] = 'Y';
00404 ntCompTable['Y'] = 'R';
00405 ntCompTable['M'] = 'K';
00406 ntCompTable['K'] = 'M';
00407 ntCompTable['S'] = 'S';
00408 ntCompTable['W'] = 'W';
00409 ntCompTable['V'] = 'B';
00410 ntCompTable['H'] = 'D';
00411 ntCompTable['D'] = 'H';
00412 ntCompTable['B'] = 'V';
00413 ntCompTable['X'] = 'N';
00414 ntCompTable['r'] = 'y';
00415 ntCompTable['y'] = 'r';
00416 ntCompTable['s'] = 's';
00417 ntCompTable['w'] = 'w';
00418 ntCompTable['m'] = 'k';
00419 ntCompTable['k'] = 'm';
00420 ntCompTable['v'] = 'b';
00421 ntCompTable['h'] = 'd';
00422 ntCompTable['d'] = 'h';
00423 ntCompTable['b'] = 'v';
00424 ntCompTable['x'] = 'n';
00425 ntCompTable['('] = ')';
00426 ntCompTable[')'] = '(';
00427 inittedCompTable = TRUE;
00428 }
00429 
00430 /* Complement DNA (not reverse). */
00431 void complement(DNA *dna, long length)
00432 {
00433 int i;
00434 
00435 if (!inittedCompTable) initNtCompTable();
00436 for (i=0; i<length; ++i)
00437     {
00438     *dna = ntCompTable[(int)*dna];
00439     ++dna;
00440     }
00441 }
00442 
00443 
00444 /* Reverse complement DNA. */
00445 void reverseComplement(DNA *dna, long length)
00446 {
00447 reverseBytes(dna, length);
00448 complement(dna, length);
00449 }
00450 
00451 /* Reverse offset - return what will be offset (0 based) to
00452  * same member of array after array is reversed. */
00453 long reverseOffset(long offset, long arraySize)
00454 {
00455 return arraySize-1 - offset;
00456 }
00457 
00458 /* Switch start/end (zero based half open) coordinates
00459  * to opposite strand. */
00460 void reverseIntRange(int *pStart, int *pEnd, int size)
00461 {
00462 int temp;
00463 temp = *pStart;
00464 *pStart = size - *pEnd;
00465 *pEnd = size - temp;
00466 }
00467 
00468 /* Switch start/end (zero based half open) coordinates
00469  * to opposite strand. */
00470 void reverseUnsignedRange(unsigned *pStart, unsigned *pEnd, int size)
00471 {
00472 unsigned temp;
00473 temp = *pStart;
00474 *pStart = size - *pEnd;
00475 *pEnd = size - temp;
00476 }
00477 
00478 
00479 /* Convert T's to U's */
00480 void toRna(DNA *dna)
00481 {
00482 DNA c;
00483 for (;;)
00484     {
00485     c = *dna;
00486     if (c == 't')
00487         *dna = 'u';
00488     else if (c == 'T')
00489         *dna = 'U';
00490     else if (c == 0)
00491         break;
00492     ++dna;
00493     }
00494 }
00495 
00496 char *skipIgnoringDash(char *a, int size, bool skipTrailingDash)
00497 /* Count size number of characters, and any 
00498  * dash characters. */
00499 {
00500 while (size > 0)
00501     {
00502     if (*a++ != '-')
00503         --size;
00504     }
00505 if (skipTrailingDash)
00506     while (*a == '-')
00507        ++a;
00508 return a;
00509 }
00510 
00511 int countNonDash(char *a, int size)
00512 /* Count number of non-dash characters. */
00513 {
00514 int count = 0;
00515 int i;
00516 for (i=0; i<size; ++i)
00517     if (a[i] != '-') 
00518         ++count;
00519 return count;
00520 }
00521 
00522 int nextPowerOfFour(long x)
00523 /* Return next power of four that would be greater or equal to x.
00524  * For instance if x < 4, return 1, if x < 16 return 2.... 
00525  * (From biological point of view how many bases are needed to
00526  * code this number.) */
00527 {
00528 int count = 1;
00529 while (x > 4)
00530     {
00531     count += 1;
00532     x >>= 2;
00533     }
00534 return count;
00535 }
00536 
00537 long dnaOrAaFilteredSize(char *raw, char filter[256])
00538 /* Return how long DNA will be after non-DNA is filtered out. */
00539 {
00540 char c;
00541 long count = 0;
00542 dnaUtilOpen();
00543 while ((c = *raw++) != 0)
00544     {
00545     if (filter[(int)c]) ++count;
00546     }
00547 return count;
00548 }
00549 
00550 void dnaOrAaFilter(char *in, char *out, char filter[256])
00551 /* Run chars through filter. */
00552 {
00553 char c;
00554 dnaUtilOpen();
00555 while ((c = *in++) != 0)
00556     {
00557     if ((c = filter[(int)c]) != 0) *out++ = c;
00558     }
00559 *out++ = 0;
00560 }
00561 
00562 long dnaFilteredSize(char *rawDna)
00563 /* Return how long DNA will be after non-DNA is filtered out. */
00564 {
00565 return dnaOrAaFilteredSize(rawDna, ntChars);
00566 }
00567 
00568 void dnaFilter(char *in, DNA *out)
00569 /* Filter out non-DNA characters and change to lower case. */
00570 {
00571 dnaOrAaFilter(in, out, ntChars);
00572 }
00573 
00574 void dnaFilterToN(char *in, DNA *out)
00575 /* Change all non-DNA characters to N. */
00576 {
00577 DNA c;
00578 initNtChars();
00579 while ((c = *in++) != 0)
00580     {
00581     if ((c = ntChars[(int)c]) != 0) *out++ = c;
00582     else *out++ = 'n';
00583     }
00584 *out++ = 0;
00585 }
00586 
00587 void dnaMixedCaseFilter(char *in, DNA *out)
00588 /* Filter out non-DNA characters but leave case intact. */
00589 {
00590 dnaOrAaFilter(in, out, ntMixedCaseChars);
00591 }
00592 
00593 long aaFilteredSize(char *raw)
00594 /* Return how long aa will be after non-aa chars is filtered out. */
00595 {
00596 return dnaOrAaFilteredSize(raw, aaChars);
00597 }
00598 
00599 void aaFilter(char *in, DNA *out)
00600 /* Filter out non-aa characters and change to upper case. */
00601 {
00602 dnaOrAaFilter(in, out, aaChars);
00603 }
00604 
00605 void upperToN(char *s, int size)
00606 /* Turn upper case letters to N's. */
00607 {
00608 char c;
00609 int i;
00610 for (i=0; i<size; ++i)
00611     {
00612     c = s[i];
00613     if (isupper(c))
00614         s[i] = 'n';
00615     }
00616 }
00617 
00618 void lowerToN(char *s, int size)
00619 /* Turn lower case letters to N's. */
00620 {
00621 char c;
00622 int i;
00623 for (i=0; i<size; ++i)
00624     {
00625     c = s[i];
00626     if (islower(c))
00627         s[i] = 'N';
00628     }
00629 }
00630 
00631 
00632 void dnaBaseHistogram(DNA *dna, int dnaSize, int histogram[4])
00633 /* Count up frequency of occurance of each base and store 
00634  * results in histogram. */
00635 {
00636 int val;
00637 zeroBytes(histogram, 4*sizeof(int));
00638 while (--dnaSize >= 0)
00639     {
00640     if ((val = ntVal[(int)*dna++]) >= 0)
00641         ++histogram[val];
00642     }
00643 }
00644 
00645 bits32 packDna16(DNA *in)
00646 /* pack 16 bases into a word */
00647 {
00648 bits32 out = 0;
00649 int count = 16;
00650 int bVal;
00651 while (--count >= 0)
00652     {
00653     bVal = ntValNoN[(int)*in++];
00654     out <<= 2;
00655     out += bVal;
00656     }
00657 return out;
00658 }
00659 
00660 bits16 packDna8(DNA *in)
00661 /* Pack 8 bases into a short word */
00662 {
00663 bits16 out = 0;
00664 int count = 8;
00665 int bVal;
00666 while (--count >= 0)
00667     {
00668     bVal = ntValNoN[(int)*in++];
00669     out <<= 2;
00670     out += bVal;
00671     }
00672 return out;
00673 }
00674 
00675 UBYTE packDna4(DNA *in)
00676 /* Pack 4 bases into a UBYTE */
00677 {
00678 UBYTE out = 0;
00679 int count = 4;
00680 int bVal;
00681 while (--count >= 0)
00682     {
00683     bVal = ntValNoN[(int)*in++];
00684     out <<= 2;
00685     out += bVal;
00686     }
00687 return out;
00688 }
00689 
00690 void unpackDna(bits32 *tiles, int tileCount, DNA *out)
00691 /* Unpack DNA. Expands to 16x tileCount in output. */
00692 {
00693 int i, j;
00694 bits32 tile;
00695 
00696 for (i=0; i<tileCount; ++i)
00697     {
00698     tile = tiles[i];
00699     for (j=15; j>=0; --j)
00700         {
00701         out[j] = valToNt[tile & 0x3];
00702         tile >>= 2;
00703         }
00704     out += 16;
00705     }
00706 }
00707 
00708 void unpackDna4(UBYTE *tiles, int byteCount, DNA *out)
00709 /* Unpack DNA. Expands to 4x byteCount in output. */
00710 {
00711 int i, j;
00712 UBYTE tile;
00713 
00714 for (i=0; i<byteCount; ++i)
00715     {
00716     tile = tiles[i];
00717     for (j=3; j>=0; --j)
00718         {
00719         out[j] = valToNt[tile & 0x3];
00720         tile >>= 2;
00721         }
00722     out += 4;
00723     }
00724 }
00725 
00726 
00727 
00728 
00729 static void checkSizeTypes()
00730 /* Make sure that some of our predefined types are the right size. */
00731 {
00732 assert(sizeof(UBYTE) == 1);
00733 assert(sizeof(WORD) == 2);
00734 assert(sizeof(bits32) == 4);
00735 assert(sizeof(bits16) == 2);
00736 }
00737 
00738 int intronOrientationMinSize(DNA *iStart, DNA *iEnd, int minIntronSize)
00739 /* Given a gap in genome from iStart to iEnd, return 
00740  * Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no
00741  * intron.  Assumes DNA is lower cased. */
00742 {
00743 if (iEnd - iStart < minIntronSize)
00744     return 0;
00745 if (iStart[0] == 'g' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'g')
00746     {
00747     return 1;
00748     }
00749 else if (iStart[0] == 'c' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'c')
00750     {
00751     return -1;
00752     }
00753 else
00754     return 0;
00755 }
00756 
00757 int intronOrientation(DNA *iStart, DNA *iEnd)
00758 /* Given a gap in genome from iStart to iEnd, return 
00759  * Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no
00760  * intron.  Assumes DNA is lower cased. */
00761 {
00762 return intronOrientationMinSize(iStart, iEnd, 32);
00763 }
00764 
00765 int dnaScore2(DNA a, DNA b)
00766 /* Score match between two bases (relatively crudely). */
00767 {
00768 if (a == 'n' || b == 'n') return 0;
00769 if (a == b) return 1;
00770 else return -1;
00771 }
00772 
00773 int  dnaOrAaScoreMatch(char *a, char *b, int size, int matchScore, int mismatchScore, 
00774         char ignore)
00775 /* Compare two sequences (without inserts or deletions) and score. */
00776 {
00777 int i;
00778 int score = 0;
00779 for (i=0; i<size; ++i)
00780     {
00781     char aa = a[i];
00782     char bb = b[i];
00783     if (aa == ignore || bb == ignore)
00784         continue;
00785     if (aa == bb)
00786         score += matchScore;
00787     else
00788         score += mismatchScore;
00789     }
00790 return score;
00791 }
00792 
00793 int dnaScoreMatch(DNA *a, DNA *b, int size)
00794 /* Compare two pieces of DNA base by base. Total mismatches are
00795  * subtracted from total matches and returned as score. 'N's 
00796  * neither hurt nor help score. */
00797 {
00798 return dnaOrAaScoreMatch(a, b, size, 1, -1, 'n');
00799 }
00800 
00801 int aaScore2(AA a, AA b)
00802 /* Score match between two amino acids (relatively crudely). */
00803 {
00804 if (a == 'X' || b == 'X') return 0;
00805 if (a == b) return 2;
00806 else return -1;
00807 }
00808 
00809 int aaScoreMatch(AA *a, AA *b, int size)
00810 /* Compare two peptides aa by aa. */
00811 {
00812 return dnaOrAaScoreMatch(a, b, size, 2, -1, 'X');
00813 }
00814 
00815 void writeSeqWithBreaks(FILE *f, char *letters, int letterCount, int maxPerLine)
00816 /* Write out letters with newlines every maxLine. */
00817 {
00818 int lettersLeft = letterCount;
00819 int lineSize;
00820 while (lettersLeft > 0)
00821     {
00822     lineSize = lettersLeft;
00823     if (lineSize > maxPerLine)
00824         lineSize = maxPerLine;
00825     mustWrite(f, letters, lineSize);
00826     fputc('\n', f);
00827     letters += lineSize;
00828     lettersLeft -= lineSize;
00829     }
00830 }
00831 
00832 static int findTailPolyAMaybeMask(DNA *dna, int size, boolean doMask,
00833                                   boolean loose)
00834 /* Identify PolyA at end; mask to 'n' if specified.  This allows a few 
00835  * non-A's as noise to be trimmed too.  Returns number of bases trimmed.  
00836  * Leaves first two bases of PolyA in case there's a taa stop codon. */
00837 {
00838 int i;
00839 int score = 10;
00840 int bestScore = 10;
00841 int bestPos = -1;
00842 int trimSize = 0;
00843 
00844 for (i=size-1; i>=0; --i)
00845     {
00846     DNA b = dna[i];
00847     if (b == 'n' || b == 'N')
00848         continue;
00849     if (score > 20) score = 20;
00850     if (b == 'a' || b == 'A')
00851         {
00852         score += 1;
00853         if (score >= bestScore)
00854             {
00855             bestScore = score;
00856             bestPos = i;
00857             }
00858         else if (loose && score >= (bestScore - 8))
00859             {
00860             /* If loose, keep extending even if score isn't back up to best. */
00861             bestPos = i;
00862             }
00863         }
00864     else
00865         {
00866         score -= 10;
00867         }
00868     if (score < 0)
00869         {
00870         break;
00871         }
00872     }
00873 if (bestPos >= 0)
00874     {
00875     trimSize = size - bestPos - 2;      // Leave two for aa in taa stop codon
00876     if (trimSize > 0)
00877         {
00878         if (doMask)
00879             for (i=size - trimSize; i<size; ++i)
00880                 dna[i] = 'n';
00881         }
00882     else
00883         trimSize = 0;
00884     }
00885 return trimSize;
00886 }
00887 
00888 int tailPolyASizeLoose(DNA *dna, int size)
00889 /* Return size of PolyA at end (if present).  This allows a few non-A's as 
00890  * noise to be trimmed too, but skips first two aa for taa stop codon.  
00891  * It is less conservative in extending the polyA region than maskTailPolyA. */
00892 {
00893 return findTailPolyAMaybeMask(dna, size, FALSE, TRUE);
00894 }
00895 
00896 int maskTailPolyA(DNA *dna, int size)
00897 /* Convert PolyA at end to n.  This allows a few non-A's as noise to be 
00898  * trimmed too.  Returns number of bases trimmed.  Leaves very last a. */
00899 {
00900 return findTailPolyAMaybeMask(dna, size, TRUE, FALSE);
00901 }
00902 
00903 static int findHeadPolyTMaybeMask(DNA *dna, int size, boolean doMask,
00904                                   boolean loose)
00905 /* Return size of PolyT at start (if present); mask to 'n' if specified.  
00906  * This allows a few non-T's as noise to be trimmed too, but skips last
00907  * two tt for revcomp'd taa stop codon. */
00908 {
00909 int i;
00910 int score = 10;
00911 int bestScore = 10;
00912 int bestPos = -1;
00913 int pastPoly = 0;
00914 int trimSize = 0;
00915 
00916 for (i=0; i<size; ++i)
00917     {
00918     DNA b = dna[i];
00919     if (b == 'n' || b == 'N')
00920         continue;
00921     if (score > 20) score = 20;
00922     if (b == 't' || b == 'T')
00923         {
00924         score += 1;
00925         if (score >= bestScore)
00926             {
00927             bestScore = score;
00928             bestPos = i;
00929             }
00930         else if (loose && score >= (bestScore - 8))
00931             {
00932             /* If loose, keep extending even if score isn't back up to best. */
00933             bestPos = i;
00934             }
00935         }
00936     else
00937         {
00938         score -= 10;
00939         }
00940     if (score < 0)
00941         {
00942         pastPoly = i;
00943         break;
00944         }
00945     }
00946 if (bestPos >= 0)
00947     {
00948     trimSize = bestPos+1 - 2;   // Leave two for aa in taa stop codon
00949     if (trimSize > 0)
00950         {
00951         if (doMask)
00952             memset(dna, 'n', trimSize);
00953         }
00954     else
00955         trimSize = 0;
00956     }
00957 return trimSize;
00958 }
00959 
00960 int headPolyTSizeLoose(DNA *dna, int size)
00961 /* Return size of PolyT at start (if present).  This allows a few non-T's as 
00962  * noise to be trimmed too, but skips last two tt for revcomp'd taa stop 
00963  * codon.  
00964  * It is less conservative in extending the polyA region than maskHeadPolyT. */
00965 {
00966 return findHeadPolyTMaybeMask(dna, size, FALSE, TRUE);
00967 }
00968 
00969 int maskHeadPolyT(DNA *dna, int size)
00970 /* Convert PolyT at start.  This allows a few non-T's as noise to be 
00971  * trimmed too.  Returns number of bases trimmed.  */
00972 {
00973 return findHeadPolyTMaybeMask(dna, size, TRUE, FALSE);
00974 }
00975 
00976 boolean isDna(char *poly, int size)
00977 /* Return TRUE if letters in poly are at least 90% ACGTU */
00978 {
00979 int i;
00980 int dnaCount = 0;
00981 
00982 dnaUtilOpen();
00983 for (i=0; i<size; ++i)
00984     {
00985     if (ntChars[(int)poly[i]]) 
00986         dnaCount += 1;
00987     }
00988 return (dnaCount >= round(0.9 * size));
00989 }
00990 
00991 boolean isAllDna(char *poly, int size)
00992 /* Return TRUE if letters in poly are 100% ACGTU */
00993 {
00994 int i;
00995 
00996 if (size <= 1)
00997     return FALSE;
00998 dnaUtilOpen();
00999 for (i=0; i<size-1; ++i)
01000     {
01001     if (ntChars[(int)poly[i]] == 0) 
01002         return FALSE;
01003     }
01004 return TRUE;
01005 }
01006 
01007 
01008 
01009 /* Tables to convert from 0-20 to ascii single letter representation
01010  * of proteins. */
01011 int aaVal[256];
01012 AA valToAa[20];
01013 
01014 AA aaChars[256];        /* 0 except for value aa characters.  Converts to upper case rest. */
01015 
01016 struct aminoAcidTable
01017 /* A little info about each amino acid. */
01018     {
01019     int ix;
01020     char letter;
01021     char abbreviation[3];
01022     char *name;
01023     };
01024 
01025 struct aminoAcidTable aminoAcidTable[] = 
01026 {
01027     {0, 'A', "ala", "alanine"},
01028     {1, 'C', "cys", "cysteine"},
01029     {2, 'D', "asp",  "aspartic acid"},
01030     {3, 'E', "glu",  "glutamic acid"},
01031     {4, 'F', "phe",  "phenylalanine"},
01032     {5, 'G', "gly",  "glycine"},
01033     {6, 'H', "his",  "histidine"},
01034     {7, 'I', "ile",  "isoleucine"},
01035     {8, 'K', "lys",  "lysine"},
01036     {9, 'L', "leu",  "leucine"},
01037     {10, 'M',  "met", "methionine"},
01038     {11, 'N',  "asn", "asparagine"},
01039     {12, 'P',  "pro", "proline"},
01040     {13, 'Q',  "gln", "glutamine"},
01041     {14, 'R',  "arg", "arginine"},
01042     {15, 'S',  "ser", "serine"},
01043     {16, 'T',  "thr", "threonine"},
01044     {17, 'V',  "val", "valine"},
01045     {18, 'W',  "try", "tryptophan"},
01046     {19, 'Y',  "tyr", "tyrosine"},
01047 };
01048 
01049 static void initAaVal()
01050 /* Initialize aaVal and valToAa tables. */
01051 {
01052 int i;
01053 char c, lowc;
01054 
01055 for (i=0; i<ArraySize(aaVal); ++i)
01056     aaVal[i] = -1;
01057 for (i=0; i<ArraySize(aminoAcidTable); ++i)
01058     {
01059     c = aminoAcidTable[i].letter;
01060     lowc = tolower(c);
01061     aaVal[(int)c] = aaVal[(int)lowc] = i;
01062     aaChars[(int)c] = aaChars[(int)lowc] = c;
01063     valToAa[i] = c;
01064     }
01065 aaChars['x'] = aaChars['X'] = 'X';
01066 }
01067 
01068 void dnaUtilOpen()
01069 /* Initialize stuff herein. */
01070 {
01071 static boolean opened = FALSE;
01072 if (!opened)
01073     {
01074     checkSizeTypes();
01075     initNtVal();
01076     initAaVal();
01077     initNtChars();
01078     initNtMixedCaseChars();
01079     initNtCompTable();
01080     opened = TRUE;
01081     }
01082 }
01083 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2