00001 /* Some stuff that you'll likely need in any program that works with 00002 * DNA. Includes stuff for amino acids as well. 00003 * 00004 * Assumes that DNA is stored as a character. 00005 * The DNA it generates will include the bases 00006 * as lowercase tcag. It will generally accept 00007 * uppercase as well, and also 'n' or 'N' or '-' 00008 * for unknown bases. 00009 * 00010 * Amino acids are stored as single character upper case. 00011 * 00012 * This file is copyright 2002 Jim Kent, but license is hereby 00013 * granted for all use - public, private or commercial. */ 00014 00015 00016 #ifndef DNAUTIL_H 00017 #define DNAUTIL_H 00018 00019 void dnaUtilOpen(); /* Good idea to call this before using any arrays 00020 * here. */ 00021 00022 /* Numerical values for bases. */ 00023 #define T_BASE_VAL 0 00024 #define U_BASE_VAL 0 00025 #define C_BASE_VAL 1 00026 #define A_BASE_VAL 2 00027 #define G_BASE_VAL 3 00028 #define N_BASE_VAL 4 /* Used in 1/2 byte representation. */ 00029 00030 typedef char DNA; 00031 typedef char AA; 00032 typedef char BIOPOL; /* Biological polymer. */ 00033 00034 /* A little array to help us decide if a character is a 00035 * nucleotide, and if so convert it to lower case. 00036 * Contains zeroes for characters that aren't used 00037 * in DNA sequence. */ 00038 extern DNA ntChars[256]; 00039 extern AA aaChars[256]; 00040 00041 /* An array that converts alphabetical DNA representation 00042 * to numerical one: X_BASE_VAL as above. For charaters 00043 * other than [atgcATGC], has -1. */ 00044 extern int ntVal[256]; 00045 extern int aaVal[256]; 00046 extern int ntValLower[256]; /* NT values only for lower case. */ 00047 extern int ntValUpper[256]; /* NT values only for upper case. */ 00048 00049 /* Like ntVal, but with T_BASE_VAL in place of -1 for nonexistent nucleotides. */ 00050 extern int ntValNoN[256]; 00051 00052 /* Like ntVal but with N_BASE_VAL in place of -1 for 'n', 'x', '-', etc. */ 00053 extern int ntVal5[256]; 00054 00055 /* Inverse array - takes X_BASE_VAL int to a DNA char 00056 * value. */ 00057 extern DNA valToNt[]; 00058 00059 /* Similar array that doesn't convert to lower case. */ 00060 extern DNA ntMixedCaseChars[256]; 00061 00062 /* Another array to help us do complement of DNA */ 00063 extern DNA ntCompTable[256]; 00064 00065 /* Arrays to convert between lower case indicating repeat masking, and 00066 * a 1/2 byte representation where the 4th bit indicates if the characeter 00067 * is masked. Uses N_BASE_VAL for `n', `x', etc. 00068 */ 00069 #define MASKED_BASE_BIT 8 00070 extern int ntValMasked[256]; 00071 extern DNA valToNtMasked[256]; 00072 00073 /*Complement DNA (not reverse)*/ 00074 void complement(DNA *dna, long length); 00075 00076 /* Reverse complement DNA. */ 00077 void reverseComplement(DNA *dna, long length); 00078 00079 00080 /* Reverse offset - return what will be offset (0 based) to 00081 * same member of array after array is reversed. */ 00082 long reverseOffset(long offset, long arraySize); 00083 00084 /* Switch start/end (zero based half open) coordinates 00085 * to opposite strand. */ 00086 void reverseIntRange(int *pStart, int *pEnd, int size); 00087 00088 /* Switch start/end (zero based half open) coordinates 00089 * to opposite strand. */ 00090 void reverseUnsignedRange(unsigned *pStart, unsigned *pEnd, int size); 00091 00092 enum dnaCase {dnaUpper,dnaLower,dnaMixed,}; 00093 /* DNA upper, lower, or mixed case? */ 00094 00095 /* Convert T's to U's */ 00096 void toRna(DNA *dna); 00097 00098 typedef char Codon; /* Our codon type. */ 00099 00100 /* Return single letter code (upper case) for protein. 00101 * Returns X for bad input, 0 for stop codon. 00102 * The "Standard" Code */ 00103 AA lookupCodon(DNA *dna); 00104 00105 boolean isStopCodon(DNA *dna); 00106 /* Return TRUE if it's a stop codon. */ 00107 00108 boolean isKozak(char *dna, int dnaSize, int pos); 00109 /* Return TRUE if it's a Kozak compatible start, using a relatively 00110 * weak definition (either A/G 3 bases before or G after) . */ 00111 00112 boolean isReallyStopCodon(char *dna, boolean selenocysteine); 00113 /* Return TRUE if it's really a stop codon, even considering 00114 * possibilility of selenocysteine. */ 00115 00116 /* Returns one letter code for protein, 00117 * 0 for stop codon or X for bad input, 00118 * Vertebrate Mitochondrial Code */ 00119 AA lookupMitoCodon(DNA *dna); 00120 00121 Codon codonVal(DNA *start); 00122 /* Return value from 0-63 of codon starting at start. 00123 * Returns -1 if not a codon. */ 00124 00125 DNA *valToCodon(int val); 00126 /* Return codon corresponding to val (0-63) */ 00127 00128 void dnaTranslateSome(DNA *dna, char *out, int outSize); 00129 /* Translate DNA upto a stop codon or until outSize-1 amino acids, 00130 * whichever comes first. Output will be zero terminated. */ 00131 00132 char *skipIgnoringDash(char *a, int size, bool skipTrailingDash); 00133 /* Count size number of characters, and any 00134 * dash characters. */ 00135 00136 int countNonDash(char *a, int size); 00137 /* Count number of non-dash characters. */ 00138 00139 int nextPowerOfFour(long x); 00140 /* Return next power of four that would be greater or equal to x. 00141 * For instance if x < 4, return 1, if x < 16 return 2.... 00142 * (From biological point of view how many bases are needed to 00143 * code this number.) */ 00144 00145 long dnaFilteredSize(char *rawDna); 00146 /* Return how long DNA will be after non-DNA is filtered out. */ 00147 00148 long aaFilteredSize(char *rawDna); 00149 /* Return how long peptide will be after non-peptide is filtered out. */ 00150 00151 void dnaFilter(char *in, DNA *out); 00152 /* Filter out non-DNA characters. */ 00153 00154 void aaFilter(char *in, DNA *out); 00155 /* Filter out non-peptide characters. */ 00156 00157 void dnaFilterToN(char *in, DNA *out); 00158 /* Change all non-DNA characters to N. */ 00159 00160 void upperToN(char *s, int size); 00161 /* Turn upper case letters to N's. */ 00162 00163 void lowerToN(char *s, int size); 00164 /* Turn lower case letters to N's. */ 00165 00166 void dnaBaseHistogram(DNA *dna, int dnaSize, int histogram[4]); 00167 /* Count up frequency of occurance of each base and store 00168 * results in histogram. Use X_BASE_VAL to index histogram. */ 00169 00170 void dnaMixedCaseFilter(char *in, DNA *out); 00171 /* Filter out non-DNA characters but leave case intact. */ 00172 00173 bits32 packDna16(DNA *in); 00174 /* pack 16 bases into a word */ 00175 00176 bits16 packDna8(DNA *in); 00177 /* Pack 8 bases into a short word */ 00178 00179 UBYTE packDna4(DNA *in); 00180 /* Pack 4 bases into a UBYTE */ 00181 00182 void unpackDna(bits32 *tiles, int tileCount, DNA *out); 00183 /* Unpack DNA. Expands to 16x tileCount in output. */ 00184 00185 void unpackDna4(UBYTE *tiles, int byteCount, DNA *out); 00186 /* Unpack DNA. Expands to 4x byteCount in output. */ 00187 00188 void unalignedUnpackDna(bits32 *tiles, int start, int size, DNA *unpacked); 00189 /* Unpack into out, even though not starting/stopping on tile 00190 * boundaries. */ 00191 00192 int intronOrientationMinSize(DNA *iStart, DNA *iEnd, int minIntronSize); 00193 /* Given a gap in genome from iStart to iEnd, return 00194 * Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no 00195 * intron. */ 00196 00197 int intronOrientation(DNA *iStart, DNA *iEnd); 00198 /* Given a gap in genome from iStart to iEnd, return 00199 * Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no 00200 * intron. Assumes minIntronSize of 32. */ 00201 00202 int dnaScore2(DNA a, DNA b); 00203 /* Score match between two bases (relatively crudely). */ 00204 00205 int dnaScoreMatch(DNA *a, DNA *b, int size); 00206 /* Compare two pieces of DNA base by base. Total mismatches are 00207 * subtracted from total matches and returned as score. 'N's 00208 * neither hurt nor help score. */ 00209 00210 int aaScore2(AA a, AA b); 00211 /* Score match between two bases (relatively crudely). */ 00212 00213 int aaScoreMatch(AA *a, AA *b, int size); 00214 /* Compare two peptides aa by aa. */ 00215 00216 int dnaOrAaScoreMatch(char *a, char *b, int size, int matchScore, int mismatchScore, 00217 char ignore); 00218 /* Compare two sequences (without inserts or deletions) and score. */ 00219 00220 void writeSeqWithBreaks(FILE *f, char *letters, int letterCount, int maxPerLine); 00221 /* Write out letters with newlines every maxLine. */ 00222 00223 int tailPolyASizeLoose(DNA *dna, int size); 00224 /* Return size of PolyA at end (if present). This allows a few non-A's as 00225 * noise to be trimmed too, but skips first two aa for taa stop codon. 00226 * It is less conservative in extending the polyA region than maskTailPolyA. */ 00227 00228 int headPolyTSizeLoose(DNA *dna, int size); 00229 /* Return size of PolyT at start (if present). This allows a few non-T's as 00230 * noise to be trimmed too, but skips last two tt for revcomp'd taa stop 00231 * codon. 00232 * It is less conservative in extending the polyA region than maskHeadPolyT. */ 00233 00234 int maskTailPolyA(DNA *dna, int size); 00235 /* Convert PolyA at end to n. This allows a few non-A's as noise to be 00236 * trimmed too. Returns number of bases trimmed. */ 00237 00238 int maskHeadPolyT(DNA *dna, int size); 00239 /* Convert PolyT at start. This allows a few non-T's as noise to be 00240 * trimmed too. Returns number of bases trimmed. */ 00241 00242 boolean isDna(char *poly, int size); 00243 /* Return TRUE if letters in poly are at least 90% ACGTU */ 00244 00245 boolean isAllDna(char *poly, int size); 00246 /* Return TRUE if letters in poly are 100% ACGTU */ 00247 00248 #endif /* DNAUTIL_H */
1.5.2