inc/dnautil.h

Go to the documentation of this file.
00001 /* Some stuff that you'll likely need in any program that works with
00002  * DNA.  Includes stuff for amino acids as well. 
00003  *
00004  * Assumes that DNA is stored as a character.
00005  * The DNA it generates will include the bases 
00006  * as lowercase tcag.  It will generally accept
00007  * uppercase as well, and also 'n' or 'N' or '-'
00008  * for unknown bases. 
00009  *
00010  * Amino acids are stored as single character upper case. 
00011  *
00012  * This file is copyright 2002 Jim Kent, but license is hereby
00013  * granted for all use - public, private or commercial. */
00014 
00015 
00016 #ifndef DNAUTIL_H
00017 #define DNAUTIL_H
00018 
00019 void dnaUtilOpen(); /* Good idea to call this before using any arrays
00020                      * here.  */
00021 
00022 /* Numerical values for bases. */
00023 #define T_BASE_VAL 0
00024 #define U_BASE_VAL 0
00025 #define C_BASE_VAL 1
00026 #define A_BASE_VAL 2
00027 #define G_BASE_VAL 3
00028 #define N_BASE_VAL 4   /* Used in 1/2 byte representation. */
00029 
00030 typedef char DNA;
00031 typedef char AA;
00032 typedef char BIOPOL;    /* Biological polymer. */
00033 
00034 /* A little array to help us decide if a character is a 
00035  * nucleotide, and if so convert it to lower case. 
00036  * Contains zeroes for characters that aren't used
00037  * in DNA sequence. */
00038 extern DNA ntChars[256];
00039 extern AA aaChars[256];
00040 
00041 /* An array that converts alphabetical DNA representation
00042  * to numerical one: X_BASE_VAL as above.  For charaters
00043  * other than [atgcATGC], has -1. */
00044 extern int ntVal[256];
00045 extern int aaVal[256];
00046 extern int ntValLower[256];     /* NT values only for lower case. */
00047 extern int ntValUpper[256];     /* NT values only for upper case. */
00048 
00049 /* Like ntVal, but with T_BASE_VAL in place of -1 for nonexistent nucleotides. */
00050 extern int ntValNoN[256];     
00051 
00052 /* Like ntVal but with N_BASE_VAL in place of -1 for 'n', 'x', '-', etc. */
00053 extern int ntVal5[256];
00054 
00055 /* Inverse array - takes X_BASE_VAL int to a DNA char
00056  * value. */
00057 extern DNA valToNt[];
00058 
00059 /* Similar array that doesn't convert to lower case. */
00060 extern DNA ntMixedCaseChars[256];
00061 
00062 /* Another array to help us do complement of DNA  */
00063 extern DNA ntCompTable[256];
00064 
00065 /* Arrays to convert between lower case indicating repeat masking, and
00066  * a 1/2 byte representation where the 4th bit indicates if the characeter
00067  * is masked. Uses N_BASE_VAL for `n', `x', etc.
00068 */
00069 #define MASKED_BASE_BIT 8
00070 extern int ntValMasked[256];
00071 extern DNA valToNtMasked[256];
00072 
00073 /*Complement DNA (not reverse)*/
00074 void complement(DNA *dna, long length);
00075 
00076 /* Reverse complement DNA. */
00077 void reverseComplement(DNA *dna, long length);
00078 
00079 
00080 /* Reverse offset - return what will be offset (0 based) to
00081  * same member of array after array is reversed. */
00082 long reverseOffset(long offset, long arraySize);
00083 
00084 /* Switch start/end (zero based half open) coordinates
00085  * to opposite strand. */
00086 void reverseIntRange(int *pStart, int *pEnd, int size);
00087 
00088 /* Switch start/end (zero based half open) coordinates
00089  * to opposite strand. */
00090 void reverseUnsignedRange(unsigned *pStart, unsigned *pEnd, int size); 
00091 
00092 enum dnaCase {dnaUpper,dnaLower,dnaMixed,};
00093 /* DNA upper, lower, or mixed case? */
00094 
00095 /* Convert T's to U's */
00096 void toRna(DNA *dna);
00097 
00098 typedef char Codon; /* Our codon type. */
00099 
00100 /* Return single letter code (upper case) for protein.
00101  * Returns X for bad input, 0 for stop codon.
00102  * The "Standard" Code */
00103 AA lookupCodon(DNA *dna); 
00104 
00105 boolean isStopCodon(DNA *dna);
00106 /* Return TRUE if it's a stop codon. */
00107 
00108 boolean isKozak(char *dna, int dnaSize, int pos);
00109 /* Return TRUE if it's a Kozak compatible start, using a relatively
00110  * weak definition (either A/G 3 bases before or G after) . */
00111 
00112 boolean isReallyStopCodon(char *dna, boolean selenocysteine);
00113 /* Return TRUE if it's really a stop codon, even considering
00114  * possibilility of selenocysteine. */
00115 
00116 /* Returns one letter code for protein, 
00117  * 0 for stop codon or X for bad input,
00118  * Vertebrate Mitochondrial Code */
00119 AA lookupMitoCodon(DNA *dna);
00120 
00121 Codon codonVal(DNA *start);
00122 /* Return value from 0-63 of codon starting at start. 
00123  * Returns -1 if not a codon. */
00124 
00125 DNA *valToCodon(int val);
00126 /* Return  codon corresponding to val (0-63) */
00127 
00128 void dnaTranslateSome(DNA *dna, char *out, int outSize);
00129 /* Translate DNA upto a stop codon or until outSize-1 amino acids, 
00130  * whichever comes first. Output will be zero terminated. */
00131 
00132 char *skipIgnoringDash(char *a, int size, bool skipTrailingDash);
00133 /* Count size number of characters, and any 
00134  * dash characters. */
00135 
00136 int countNonDash(char *a, int size);
00137 /* Count number of non-dash characters. */
00138 
00139 int nextPowerOfFour(long x);
00140 /* Return next power of four that would be greater or equal to x.
00141  * For instance if x < 4, return 1, if x < 16 return 2.... 
00142  * (From biological point of view how many bases are needed to
00143  * code this number.) */
00144 
00145 long dnaFilteredSize(char *rawDna);
00146 /* Return how long DNA will be after non-DNA is filtered out. */
00147 
00148 long aaFilteredSize(char *rawDna);
00149 /* Return how long peptide will be after non-peptide is filtered out. */
00150 
00151 void dnaFilter(char *in, DNA *out);
00152 /* Filter out non-DNA characters. */
00153 
00154 void aaFilter(char *in, DNA *out);
00155 /* Filter out non-peptide characters. */
00156 
00157 void dnaFilterToN(char *in, DNA *out);
00158 /* Change all non-DNA characters to N. */
00159 
00160 void upperToN(char *s, int size);
00161 /* Turn upper case letters to N's. */
00162 
00163 void lowerToN(char *s, int size);
00164 /* Turn lower case letters to N's. */
00165 
00166 void dnaBaseHistogram(DNA *dna, int dnaSize, int histogram[4]);
00167 /* Count up frequency of occurance of each base and store 
00168  * results in histogram. Use X_BASE_VAL to index histogram. */
00169 
00170 void dnaMixedCaseFilter(char *in, DNA *out);
00171 /* Filter out non-DNA characters but leave case intact. */
00172 
00173 bits32 packDna16(DNA *in);
00174 /* pack 16 bases into a word */
00175 
00176 bits16 packDna8(DNA *in);
00177 /* Pack 8 bases into a short word */
00178 
00179 UBYTE packDna4(DNA *in);
00180 /* Pack 4 bases into a UBYTE */
00181 
00182 void unpackDna(bits32 *tiles, int tileCount, DNA *out);
00183 /* Unpack DNA. Expands to 16x tileCount in output. */
00184 
00185 void unpackDna4(UBYTE *tiles, int byteCount, DNA *out);
00186 /* Unpack DNA. Expands to 4x byteCount in output. */
00187 
00188 void unalignedUnpackDna(bits32 *tiles, int start, int size, DNA *unpacked);
00189 /* Unpack into out, even though not starting/stopping on tile 
00190  * boundaries. */
00191 
00192 int intronOrientationMinSize(DNA *iStart, DNA *iEnd, int minIntronSize);
00193 /* Given a gap in genome from iStart to iEnd, return 
00194  * Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no
00195  * intron. */
00196 
00197 int intronOrientation(DNA *iStart, DNA *iEnd);
00198 /* Given a gap in genome from iStart to iEnd, return 
00199  * Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no
00200  * intron.  Assumes minIntronSize of 32. */
00201 
00202 int dnaScore2(DNA a, DNA b);
00203 /* Score match between two bases (relatively crudely). */
00204 
00205 int dnaScoreMatch(DNA *a, DNA *b, int size);
00206 /* Compare two pieces of DNA base by base. Total mismatches are
00207  * subtracted from total matches and returned as score. 'N's 
00208  * neither hurt nor help score. */
00209 
00210 int aaScore2(AA a, AA b);
00211 /* Score match between two bases (relatively crudely). */
00212 
00213 int aaScoreMatch(AA *a, AA *b, int size);
00214 /* Compare two peptides aa by aa. */
00215 
00216 int  dnaOrAaScoreMatch(char *a, char *b, int size, int matchScore, int mismatchScore, 
00217         char ignore);
00218 /* Compare two sequences (without inserts or deletions) and score. */
00219 
00220 void writeSeqWithBreaks(FILE *f, char *letters, int letterCount, int maxPerLine);
00221 /* Write out letters with newlines every maxLine. */
00222 
00223 int tailPolyASizeLoose(DNA *dna, int size);
00224 /* Return size of PolyA at end (if present).  This allows a few non-A's as 
00225  * noise to be trimmed too, but skips first two aa for taa stop codon. 
00226  * It is less conservative in extending the polyA region than maskTailPolyA. */
00227 
00228 int headPolyTSizeLoose(DNA *dna, int size);
00229 /* Return size of PolyT at start (if present).  This allows a few non-T's as 
00230  * noise to be trimmed too, but skips last two tt for revcomp'd taa stop 
00231  * codon.  
00232  * It is less conservative in extending the polyA region than maskHeadPolyT. */
00233 
00234 int maskTailPolyA(DNA *dna, int size);
00235 /* Convert PolyA at end to n.  This allows a few non-A's as noise to be 
00236  * trimmed too.  Returns number of bases trimmed.  */
00237 
00238 int maskHeadPolyT(DNA *dna, int size);
00239 /* Convert PolyT at start.  This allows a few non-T's as noise to be 
00240  * trimmed too.  Returns number of bases trimmed.  */
00241 
00242 boolean isDna(char *poly, int size);
00243 /* Return TRUE if letters in poly are at least 90% ACGTU */
00244 
00245 boolean isAllDna(char *poly, int size);
00246 /* Return TRUE if letters in poly are 100% ACGTU */
00247 
00248 #endif /* DNAUTIL_H */

Generated on Tue Dec 25 18:39:29 2007 for blat by  doxygen 1.5.2