00001 /***************************************************************************** 00002 * Copyright (C) 2000 Jim Kent. This source code may be freely used * 00003 * for personal, academic, and non-profit purposes. Commercial use * 00004 * permitted only by explicit agreement with Jim Kent (jim_kent@pacbell.net) * 00005 *****************************************************************************/ 00006 /* fuzzyFind.h - This is the interface to the fuzzyFind 00007 * DNA sequence aligner. This just returns a single 00008 * alignment - the one the algorithm thinks is best. 00009 * The algorithm is heuristic, but pretty good. (See 00010 * comments in the fuzzyFind.c file for more details.) It 00011 * is not good for finding distant homologies, but it 00012 * will fairly reliably align a somewhat noisy cDNA 00013 * sequence with genomic sequence. 00014 * 00015 * The main data structure is the ffAli - which is 00016 * a node in a doubly linked list. The finder algorithm 00017 * returns a pointer to the leftmost node in the list. 00018 * When you're done with the alignment you can dispose 00019 * of it via ffFreeAli. 00020 * 00021 * The finder supports three levels of stringency. 00022 * Generally you're best off using "ffTight". "ffLoose" 00023 * will allow for more distant matches, but at the 00024 * expense of very often taking several seconds to 00025 * return a garbage alignment. "ffExact" requires 00026 * an exact match - which is quick, but in the 00027 * real world not so often useful. 00028 * 00029 * If you want to compare alignments use ffScore. 00030 */ 00031 00032 #ifndef FUZZYFIND_H 00033 #define FUZZYFIND_H 00034 00035 #ifndef MEMGFX_H 00036 #include "memgfx.h" 00037 #endif 00038 00039 #ifndef DNAUTIL_H 00040 #include "dnautil.h" 00041 #endif 00042 00043 #ifndef LOCALMEM_H 00044 #include "localmem.h" 00045 #endif 00046 00047 #ifndef ALITYPE_H 00048 #include "aliType.h" 00049 #endif 00050 00051 struct ffAli 00052 /* Node of a doubly linked list that will contain one 00053 * alignment. Contains information on a matching 00054 * set of DNA between needle and haystack. */ 00055 { 00056 struct ffAli *left; /* Neighboring intervals. */ 00057 struct ffAli *right; 00058 char *nStart, *nEnd; /* Needle start and end. (1/2 open interval) */ 00059 char *hStart, *hEnd; /* Haystack start and end. */ 00060 int startGood, endGood; /* Number that match perfectly on ends. */ 00061 }; 00062 00063 /* maximum intron size for fuzzy find functions */ 00064 00065 #define ffIntronMaxDefault 750000 /* Default maximum intron size */ 00066 00067 extern int ffIntronMax; 00068 00069 void setFfIntronMax(int value); /* change max intron size */ 00070 void setFfExtendThroughN(boolean val); /* Set whether or not can extend through N's. */ 00071 00072 /************* lib/ffAli.c routines - using alignments ************/ 00073 00074 void ffFreeAli(struct ffAli **pAli); 00075 /* Dispose of memory gotten from fuzzyFind(). */ 00076 00077 int ffOneIntronOrientation(struct ffAli *left, struct ffAli *right); 00078 /* Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no 00079 * intron. */ 00080 00081 int ffIntronOrientation(struct ffAli *ali); 00082 /* Return + for positive orientation overall, - for negative, 00083 * 0 if can't tell. */ 00084 00085 int ffScoreIntron(DNA a, DNA b, DNA y, DNA z, int orientation); 00086 /* Return a better score the closer an intron is to 00087 * consensus. Max score is 4. */ 00088 00089 struct ffAli *ffRightmost(struct ffAli *ff); 00090 /* Return rightmost block of alignment. */ 00091 00092 struct ffAli *ffMakeRightLinks(struct ffAli *rightMost); 00093 /* Given a pointer to the rightmost block in an alignment 00094 * which has all of the left pointers filled in, fill in 00095 * the right pointers and return the leftmost block. */ 00096 00097 void ffCountGoodEnds(struct ffAli *aliList); 00098 /* Fill in the goodEnd and badEnd scores. */ 00099 00100 int ffAliCount(struct ffAli *d); 00101 /* How many blocks in alignment? */ 00102 00103 struct ffAli *ffAliFromSym(int symCount, char *nSym, char *hSym, 00104 struct lm *lm, char *nStart, char *hStart); 00105 /* Convert symbol representation of alignments (letters plus '-') 00106 * to ffAli representation. If lm is nonNULL, ffAli result 00107 * will be lmAlloced, else it will be needMemed. This routine 00108 * depends on nSym/hSym being zero terminated. */ 00109 00110 /************* lib/ffScore.c routines - scoring alignments ************/ 00111 00112 int ffScoreMatch(DNA *a, DNA *b, int size); 00113 /* Compare two pieces of DNA base by base. Total mismatches are 00114 * subtracted from total matches and returned as score. 'N's 00115 * neither hurt nor help score. */ 00116 00117 int ffScoreCdna(struct ffAli *ali); 00118 /* Return score of alignment. A perfect alignment score will 00119 * be the number of bases in needle. */ 00120 00121 int ffScore(struct ffAli *ali, enum ffStringency stringency); 00122 /* Score DNA based alignment. */ 00123 00124 int ffScoreProtein(struct ffAli *ali, enum ffStringency stringency); 00125 /* Figure out overall score of protein alignment. */ 00126 00127 int ffScoreSomething(struct ffAli *ali, enum ffStringency stringency, 00128 boolean isProt); 00129 /* Score any alignment. */ 00130 00131 int ffScoreSomeAlis(struct ffAli *ali, int count, enum ffStringency stringency); 00132 /* Figure out score of count consecutive alis. */ 00133 00134 int ffCalcGapPenalty(int hGap, int nGap, enum ffStringency stringency); 00135 /* Return gap penalty for given h and n gaps. */ 00136 00137 int ffCalcCdnaGapPenalty(int hGap, int nGap); 00138 /* Return gap penalty for given h and n gaps in cDNA. */ 00139 00140 int ffGapPenalty(struct ffAli *ali, struct ffAli *right, enum ffStringency stringency); 00141 /* Calculate gap penaltly for alignment. */ 00142 00143 int ffCdnaGapPenalty(struct ffAli *ali, struct ffAli *right); 00144 /* Calculate gap penaltly for cdna alignment. */ 00145 00146 /************* jkOwnLib/ffAliHelp -helpers for alignment producers. ****************/ 00147 00148 boolean ffSlideIntrons(struct ffAli *ali); 00149 /* Slide introns (or spaces between aligned blocks) 00150 * to match consensus. Return TRUE if any slid. */ 00151 00152 boolean ffSlideOrientedIntrons(struct ffAli *ali, int orient); 00153 /* Slide introns (or spaces between aligned blocks) 00154 * to match consensus on given strand (usually from ffIntronOrientation). */ 00155 00156 struct ffAli *ffRemoveEmptyAlis(struct ffAli *ali, boolean doFree); 00157 /* Remove empty blocks from list. Optionally free empties too. */ 00158 00159 struct ffAli *ffMergeHayOverlaps(struct ffAli *ali); 00160 /* Remove overlaps in haystack that perfectly abut in needle. 00161 * These are transformed into perfectly abutting haystacks 00162 * that have a gap in the needle. */ 00163 00164 struct ffAli *ffMergeNeedleAlis(struct ffAli *ali, boolean doFree); 00165 /* Remove overlapping areas needle in alignment. Assumes ali is sorted on 00166 * ascending nStart field. Also merge perfectly abutting neighbors.*/ 00167 00168 void ffExpandExactRight(struct ffAli *ali, DNA *needleEnd, DNA *hayEnd); 00169 /* Expand aligned segment to right as far as can exactly. */ 00170 00171 void ffExpandExactLeft(struct ffAli *ali, DNA *needleStart, DNA *hayStart); 00172 /* Expand aligned segment to left as far as can exactly. */ 00173 00174 struct ffAli *ffMergeClose(struct ffAli *aliList, 00175 DNA *needleStart, DNA *hayStart); 00176 /* Remove overlapping areas needle in alignment. Assumes ali is sorted on 00177 * ascending nStart field. Also merge perfectly abutting neighbors or 00178 * ones that could be merged at the expense of just a few mismatches.*/ 00179 00180 void ffAliSort(struct ffAli **pList, 00181 int (*compare )(const void *elem1, const void *elem2)); 00182 /* Sort a doubly linked list of ffAlis. */ 00183 00184 void ffCat(struct ffAli **pA, struct ffAli **pB); 00185 /* Concatenate B to the end of A. Eat up second list 00186 * in process. */ 00187 00188 int ffCmpHitsHayFirst(const void *va, const void *vb); 00189 /* Compare function to sort hit array by ascending 00190 * target offset followed by ascending query offset. */ 00191 00192 int ffCmpHitsNeedleFirst(const void *va, const void *vb); 00193 /* Compare function to sort hit array by ascending 00194 * query offset followed by ascending target offset. */ 00195 00196 /************* jkOwnLib/fuzzyFind - old local cDNA alignment. ****************/ 00197 00198 struct ffAli *ffFind(DNA *needleStart, DNA *needleEnd, DNA *hayStart, DNA *hayEnd, 00199 enum ffStringency stringency); 00200 /* Return an alignment of needle in haystack. (Returns left end of doubly 00201 * linked alignment list.) The input DNA is all expected to be lower case 00202 * characters - a, c, g, t, or n. */ 00203 00204 boolean ffFindEitherStrand(DNA *needle, DNA *haystack, enum ffStringency stringency, 00205 struct ffAli **pAli, boolean *pRcNeedle); 00206 /* Return TRUE if find an alignment using needle, or reverse complement of 00207 * needle to search haystack. DNA must be lower case. Needle and haystack 00208 * are zero terminated. */ 00209 00210 boolean ffFindEitherStrandN(DNA *needle, int needleSize, DNA *haystack, int haySize, 00211 enum ffStringency stringency, struct ffAli **pAli, boolean *pRcNeedle); 00212 /* Return TRUE if find an alignment using needle, or reverse complement of 00213 * needle to search haystack. DNA must be lower case. */ 00214 00215 boolean ffFindAndScore(DNA *needle, int needleSize, DNA *haystack, int haySize, 00216 enum ffStringency stringency, struct ffAli **pAli, boolean *pRcNeedle, int *pScore); 00217 /* Return TRUE if find an alignment using needle, or reverse complement of 00218 * needle to search haystack. DNA must be lower case. If pScore is non-NULL returns 00219 * score of alignment. */ 00220 00221 /************* lib/fuzzyShow - display alignments. ****************/ 00222 00223 int ffShAliPart(FILE *f, struct ffAli *aliList, 00224 char *needleName, DNA *needle, int needleSize, int needleNumOffset, 00225 char *haystackName, DNA *haystack, int haySize, int hayNumOffset, 00226 int blockMaxGap, boolean rcNeedle, boolean rcHaystack, 00227 boolean showJumpTable, 00228 boolean showNeedle, boolean showHaystack, 00229 boolean showSideBySide, boolean upcMatch, 00230 int cdsS, int cdsE, int hayPartS, int hayPartE); 00231 /* Display parts of alignment on html page. If hayPartS..hayPartE is a 00232 * smaller subrange of the alignment, highlight that part of the alignment 00233 * in both needle and haystack with underline & bold, and show only that 00234 * part of the haystack (plus padding). Returns number of blocks (after 00235 * merging blocks separated by blockMaxGap or less). */ 00236 00237 int ffShAli(FILE *f, struct ffAli *aliList, 00238 char *needleName, DNA *needle, int needleSize, int needleNumOffset, 00239 char *haystackName, DNA *haystack, int haySize, int hayNumOffset, 00240 int blockMaxGap, 00241 boolean rcNeedle); 00242 /* Display alignment on html page. Returns number of blocks (after 00243 * merging blocks separated by blockMaxGap or less). */ 00244 00245 void ffShowAli(struct ffAli *aliList, 00246 char *needleName, DNA *needle, int needleNumOffset, 00247 char *haystackName, DNA *haystack, int hayNumOffset, 00248 boolean rcNeedle); 00249 /* Display alignment on html page to stdout. */ 00250 00251 #endif /* FUZZYFIND_H */ 00252
1.5.2