inc/fuzzyFind.h

Go to the documentation of this file.
00001 /*****************************************************************************
00002  * Copyright (C) 2000 Jim Kent.  This source code may be freely used         *
00003  * for personal, academic, and non-profit purposes.  Commercial use          *
00004  * permitted only by explicit agreement with Jim Kent (jim_kent@pacbell.net) *
00005  *****************************************************************************/
00006 /* fuzzyFind.h - This is the interface to the fuzzyFind
00007  * DNA sequence aligner.  This just returns a single
00008  * alignment - the one the algorithm thinks is best.
00009  * The algorithm is heuristic, but pretty good.  (See
00010  * comments in the fuzzyFind.c file for more details.) It
00011  * is not good for finding distant homologies, but it
00012  * will fairly reliably align a somewhat noisy cDNA
00013  * sequence with genomic sequence.
00014  *
00015  * The main data structure is the ffAli - which is
00016  * a node in a doubly linked list.  The finder algorithm
00017  * returns a pointer to the leftmost node in the list.
00018  * When you're done with the alignment you can dispose
00019  * of it via ffFreeAli.
00020  *
00021  * The finder supports three levels of stringency.
00022  * Generally you're best off using "ffTight".  "ffLoose"
00023  * will allow for more distant matches, but at the 
00024  * expense of very often taking several seconds to
00025  * return a garbage alignment.  "ffExact" requires
00026  * an exact match - which is quick, but in the
00027  * real world not so often useful.
00028  *
00029  * If you want to compare alignments use ffScore.
00030  */
00031 
00032 #ifndef FUZZYFIND_H
00033 #define FUZZYFIND_H
00034 
00035 #ifndef MEMGFX_H
00036 #include "memgfx.h"
00037 #endif 
00038 
00039 #ifndef DNAUTIL_H
00040 #include "dnautil.h"
00041 #endif
00042 
00043 #ifndef LOCALMEM_H
00044 #include "localmem.h"
00045 #endif
00046 
00047 #ifndef ALITYPE_H
00048 #include "aliType.h"
00049 #endif
00050 
00051 struct ffAli
00052 /* Node of a doubly linked list that will contain one
00053  * alignment. Contains information on a matching
00054  * set of DNA between needle and haystack. */
00055     {
00056     struct ffAli *left;   /* Neighboring intervals. */
00057     struct ffAli *right;
00058     char *nStart, *nEnd;          /* Needle start and end. (1/2 open interval) */
00059     char *hStart, *hEnd;          /* Haystack start and end. */
00060     int startGood, endGood; /* Number that match perfectly on ends. */
00061     };
00062 
00063 /* maximum intron size for fuzzy find functions */
00064 
00065 #define ffIntronMaxDefault 750000       /* Default maximum intron size */
00066 
00067 extern int ffIntronMax;
00068 
00069 void setFfIntronMax(int value);         /* change max intron size */
00070 void setFfExtendThroughN(boolean val);  /* Set whether or not can extend through N's. */
00071 
00072 /************* lib/ffAli.c routines - using alignments ************/
00073 
00074 void ffFreeAli(struct ffAli **pAli);
00075 /* Dispose of memory gotten from fuzzyFind(). */
00076 
00077 int ffOneIntronOrientation(struct ffAli *left, struct ffAli *right);
00078 /* Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no
00079  * intron. */
00080 
00081 int ffIntronOrientation(struct ffAli *ali);
00082 /* Return + for positive orientation overall, - for negative,
00083  * 0 if can't tell. */
00084 
00085 int ffScoreIntron(DNA a, DNA b, DNA y, DNA z, int orientation);
00086 /* Return a better score the closer an intron is to
00087  * consensus. Max score is 4. */
00088 
00089 struct ffAli *ffRightmost(struct ffAli *ff);
00090 /* Return rightmost block of alignment. */
00091 
00092 struct ffAli *ffMakeRightLinks(struct ffAli *rightMost);
00093 /* Given a pointer to the rightmost block in an alignment
00094  * which has all of the left pointers filled in, fill in
00095  * the right pointers and return the leftmost block. */
00096  
00097 void ffCountGoodEnds(struct ffAli *aliList);
00098 /* Fill in the goodEnd and badEnd scores. */
00099 
00100 int ffAliCount(struct ffAli *d);
00101 /* How many blocks in alignment? */
00102 
00103 struct ffAli *ffAliFromSym(int symCount, char *nSym, char *hSym,
00104         struct lm *lm, char *nStart, char *hStart);
00105 /* Convert symbol representation of alignments (letters plus '-')
00106  * to ffAli representation.  If lm is nonNULL, ffAli result 
00107  * will be lmAlloced, else it will be needMemed. This routine
00108  * depends on nSym/hSym being zero terminated. */
00109 
00110 /************* lib/ffScore.c routines - scoring alignments ************/
00111 
00112 int ffScoreMatch(DNA *a, DNA *b, int size);
00113 /* Compare two pieces of DNA base by base. Total mismatches are
00114  * subtracted from total matches and returned as score. 'N's 
00115  * neither hurt nor help score. */
00116 
00117 int ffScoreCdna(struct ffAli *ali);
00118 /* Return score of alignment.  A perfect alignment score will
00119  * be the number of bases in needle. */
00120 
00121 int ffScore(struct ffAli *ali, enum ffStringency stringency);
00122 /* Score DNA based alignment. */
00123 
00124 int ffScoreProtein(struct ffAli *ali, enum ffStringency stringency);
00125 /* Figure out overall score of protein alignment. */
00126 
00127 int ffScoreSomething(struct ffAli *ali, enum ffStringency stringency,
00128    boolean isProt);
00129 /* Score any alignment. */
00130 
00131 int ffScoreSomeAlis(struct ffAli *ali, int count, enum ffStringency stringency);
00132 /* Figure out score of count consecutive alis. */
00133 
00134 int ffCalcGapPenalty(int hGap, int nGap, enum ffStringency stringency);
00135 /* Return gap penalty for given h and n gaps. */
00136 
00137 int ffCalcCdnaGapPenalty(int hGap, int nGap);
00138 /* Return gap penalty for given h and n gaps in cDNA. */
00139 
00140 int ffGapPenalty(struct ffAli *ali, struct ffAli *right, enum ffStringency stringency);
00141 /* Calculate gap penaltly for alignment. */
00142 
00143 int ffCdnaGapPenalty(struct ffAli *ali, struct ffAli *right);
00144 /* Calculate gap penaltly for cdna alignment. */
00145 
00146 /************* jkOwnLib/ffAliHelp -helpers for alignment producers. ****************/
00147 
00148 boolean ffSlideIntrons(struct ffAli *ali);
00149 /* Slide introns (or spaces between aligned blocks)
00150  * to match consensus.  Return TRUE if any slid. */
00151 
00152 boolean ffSlideOrientedIntrons(struct ffAli *ali, int orient);
00153 /* Slide introns (or spaces between aligned blocks)
00154  * to match consensus on given strand (usually from ffIntronOrientation). */
00155 
00156 struct ffAli *ffRemoveEmptyAlis(struct ffAli *ali, boolean doFree);
00157 /* Remove empty blocks from list. Optionally free empties too. */
00158 
00159 struct ffAli *ffMergeHayOverlaps(struct ffAli *ali);
00160 /* Remove overlaps in haystack that perfectly abut in needle.
00161  * These are transformed into perfectly abutting haystacks
00162  * that have a gap in the needle. */
00163 
00164 struct ffAli *ffMergeNeedleAlis(struct ffAli *ali, boolean doFree);
00165 /* Remove overlapping areas needle in alignment. Assumes ali is sorted on
00166  * ascending nStart field. Also merge perfectly abutting neighbors.*/
00167 
00168 void ffExpandExactRight(struct ffAli *ali, DNA *needleEnd, DNA *hayEnd);
00169 /* Expand aligned segment to right as far as can exactly. */
00170 
00171 void ffExpandExactLeft(struct ffAli *ali, DNA *needleStart, DNA *hayStart);
00172 /* Expand aligned segment to left as far as can exactly. */
00173 
00174 struct ffAli *ffMergeClose(struct ffAli *aliList, 
00175         DNA *needleStart, DNA *hayStart);
00176 /* Remove overlapping areas needle in alignment. Assumes ali is sorted on
00177  * ascending nStart field. Also merge perfectly abutting neighbors or
00178  * ones that could be merged at the expense of just a few mismatches.*/
00179 
00180 void ffAliSort(struct ffAli **pList, 
00181         int (*compare )(const void *elem1,  const void *elem2));
00182 /* Sort a doubly linked list of ffAlis. */
00183 
00184 void ffCat(struct ffAli **pA, struct ffAli **pB);
00185 /* Concatenate B to the end of A. Eat up second list
00186  * in process. */
00187 
00188 int ffCmpHitsHayFirst(const void *va, const void *vb);
00189 /* Compare function to sort hit array by ascending
00190  * target offset followed by ascending query offset. */
00191 
00192 int ffCmpHitsNeedleFirst(const void *va, const void *vb);
00193 /* Compare function to sort hit array by ascending
00194  * query offset followed by ascending target offset. */
00195 
00196 /************* jkOwnLib/fuzzyFind - old local cDNA alignment. ****************/
00197 
00198 struct ffAli *ffFind(DNA *needleStart, DNA *needleEnd, DNA *hayStart, DNA *hayEnd,
00199     enum ffStringency stringency);
00200 /* Return an alignment of needle in haystack. (Returns left end of doubly
00201  * linked alignment list.) The input DNA is all expected to be lower case
00202  * characters - a, c, g, t, or n. */
00203 
00204 boolean ffFindEitherStrand(DNA *needle, DNA *haystack, enum ffStringency stringency,
00205     struct ffAli **pAli, boolean *pRcNeedle);
00206 /* Return TRUE if find an alignment using needle, or reverse complement of 
00207  * needle to search haystack. DNA must be lower case. Needle and haystack
00208  * are zero terminated. */
00209 
00210 boolean ffFindEitherStrandN(DNA *needle, int needleSize, DNA *haystack, int haySize,
00211     enum ffStringency stringency, struct ffAli **pAli, boolean *pRcNeedle);
00212 /* Return TRUE if find an alignment using needle, or reverse complement of 
00213  * needle to search haystack. DNA must be lower case. */
00214 
00215 boolean ffFindAndScore(DNA *needle, int needleSize, DNA *haystack, int haySize,
00216     enum ffStringency stringency, struct ffAli **pAli, boolean *pRcNeedle, int *pScore);
00217 /* Return TRUE if find an alignment using needle, or reverse complement of 
00218  * needle to search haystack. DNA must be lower case. If pScore is non-NULL returns
00219  * score of alignment. */
00220 
00221 /************* lib/fuzzyShow - display alignments. ****************/
00222 
00223 int ffShAliPart(FILE *f, struct ffAli *aliList, 
00224     char *needleName, DNA *needle, int needleSize, int needleNumOffset,
00225     char *haystackName, DNA *haystack, int haySize, int hayNumOffset,
00226     int blockMaxGap, boolean rcNeedle, boolean rcHaystack,
00227     boolean showJumpTable, 
00228     boolean showNeedle, boolean showHaystack,
00229     boolean showSideBySide, boolean upcMatch,
00230     int cdsS, int cdsE, int hayPartS, int hayPartE);
00231 /* Display parts of alignment on html page.  If hayPartS..hayPartE is a 
00232  * smaller subrange of the alignment, highlight that part of the alignment 
00233  * in both needle and haystack with underline & bold, and show only that 
00234  * part of the haystack (plus padding).  Returns number of blocks (after
00235  * merging blocks separated by blockMaxGap or less). */
00236 
00237 int ffShAli(FILE *f, struct ffAli *aliList, 
00238     char *needleName, DNA *needle, int needleSize, int needleNumOffset,
00239     char *haystackName, DNA *haystack, int haySize, int hayNumOffset,
00240     int blockMaxGap,
00241     boolean rcNeedle);
00242 /* Display alignment on html page.  Returns number of blocks (after
00243  * merging blocks separated by blockMaxGap or less). */
00244 
00245 void ffShowAli(struct ffAli *aliList, 
00246     char *needleName, DNA *needle, int needleNumOffset,
00247     char *haystackName, DNA *haystack, int hayNumOffset,
00248     boolean rcNeedle);
00249 /* Display alignment on html page to stdout. */
00250 
00251 #endif /* FUZZYFIND_H */
00252 

Generated on Tue Dec 25 18:39:29 2007 for blat by  doxygen 1.5.2