inc/axt.h

Go to the documentation of this file.
00001 /* AXT - A simple alignment format with four lines per
00002  * alignment.  The first specifies the names of the
00003  * two sequences that align and the position of the
00004  * alignment, as well as the strand.  The next two
00005  * lines contain the alignment itself including dashes
00006  * for inserts.  The alignment is separated from the
00007  * next alignment with a blank line. 
00008  *
00009  * This file contains routines to read such alignments.
00010  * Note that though the coordinates are one based and
00011  * closed on disk, they get converted to our usual half
00012  * open zero based in memory. 
00013  *
00014  * This also contains code for simple DNA alignment scoring
00015  * schemes. 
00016  *
00017  * This file is copyright 2002 Jim Kent, but license is hereby
00018  * granted for all use - public, private or commercial. */
00019 
00020 #ifndef AXT_H
00021 #define AXT_H
00022 
00023 #ifndef LINEFILE_H
00024 #include "linefile.h"
00025 #endif 
00026 
00027 #ifndef DNASEQ_H
00028 #include "dnaseq.h"
00029 #endif
00030 
00031 #ifndef CHAIN_H
00032 #include "chain.h"
00033 #endif
00034 
00035 struct axt
00036 /* This contains information about one xeno alignment. */
00037     {
00038     struct axt *next;
00039     char *qName;                /* Name of query sequence. */
00040     int qStart, qEnd;           /* Half open zero=based coordinates. */
00041     char qStrand;               /* Is this forward or reverse strand + or - */
00042     char *tName;                /* Name of target. */
00043     int tStart, tEnd;           /* Target coordinates. */
00044     char tStrand;               /* Target strand - currently always +. */
00045     int score;                  /* Score.  Zero for unknown.  Units arbitrary. */
00046     int symCount;               /* Size of alignments. */
00047     char *qSym, *tSym;          /* Alignments. */
00048     int frame;                  /* If non-zero then translation frame. */
00049     };
00050 
00051 void axtFree(struct axt **pEl);
00052 /* Free an axt. */
00053 
00054 void axtFreeList(struct axt **pList);
00055 /* Free a list of dynamically allocated axt's */
00056 
00057 struct axt *axtRead(struct lineFile *lf);
00058 /* Read in next record from .axt file and return it.
00059  * Returns NULL at EOF. */
00060 
00061 struct axt *axtReadWithPos(struct lineFile *lf, off_t *retOffset);
00062 /* Read next axt, and if retOffset is not-NULL, fill it with
00063  * offset of start of axt. */
00064 
00065 boolean axtCheck(struct axt *axt, struct lineFile *lf);
00066 /* Return FALSE if there's a problem with axt. */
00067 
00068 void axtWrite(struct axt *axt, FILE *f);
00069 /* Output axt to axt file. */
00070 
00071 int axtCmpQuery(const void *va, const void *vb);
00072 /* Compare to sort based on query position. */
00073 
00074 int axtCmpTarget(const void *va, const void *vb);
00075 /* Compare to sort based on target position. */
00076 
00077 int axtCmpScore(const void *va, const void *vb);
00078 /* Compare to sort based on score. */
00079 
00080 int axtCmpTargetScoreDesc(const void *va, const void *vb);
00081 /* Compare to sort based on target name and score descending. */
00082 
00083 struct axtScoreScheme
00084 /* A scoring scheme or DNA alignment. */
00085     {
00086     struct scoreMatrix *next;
00087     int matrix[256][256];   /* Look up with letters. */
00088     int gapOpen;        /* Gap open cost. */
00089     int gapExtend;      /* Gap extension. */
00090     char *extra;        /* extra parameters */
00091     };
00092 
00093 void axtScoreSchemeFree(struct axtScoreScheme **pObj);
00094 /* Free up score scheme. */
00095 
00096 struct axtScoreScheme *axtScoreSchemeDefault();
00097 /* Return default scoring scheme (after blastz).  Do NOT axtScoreSchemeFree
00098  * this. */
00099 
00100 struct axtScoreScheme *axtScoreSchemeSimpleDna(int match, int misMatch, int gapOpen, int gapExtend);
00101 /* Return a relatively simple scoring scheme for DNA. */
00102 
00103 struct axtScoreScheme *axtScoreSchemeRnaDefault();
00104 /* Return default scoring scheme for RNA/DNA alignments
00105  * within the same species.  Do NOT axtScoreSchemeFree
00106  * this. */
00107 
00108 struct axtScoreScheme *axtScoreSchemeFromBlastzMatrix(char *text, int gapOpen, int gapExtend);
00109 /* return scoring schema from a string in the following format */
00110 /* 91,-90,-25,-100,-90,100,-100,-25,-25,-100,100,-90,-100,-25,-90,91 */
00111 
00112 struct axtScoreScheme *axtScoreSchemeRnaFill();
00113 /* Return scoreing scheme a little more relaxed than 
00114  * RNA/DNA defaults for filling in gaps. */
00115 
00116 struct axtScoreScheme *axtScoreSchemeProteinDefault();
00117 /* Returns default protein scoring scheme.  This is
00118  * scaled to be compatible with the blastz one.  Don't
00119  * axtScoreSchemeFree this. */
00120 
00121 struct axtScoreScheme *axtScoreSchemeProteinRead(char *fileName);
00122 /* read in blosum-like matrix */
00123 
00124 struct axtScoreScheme *axtScoreSchemeRead(char *fileName);
00125 /* Read in scoring scheme from file. Looks like
00126     A    C    G    T
00127     91 -114  -31 -123
00128     -114  100 -125  -31
00129     -31 -125  100 -114
00130     -123  -31 -114   91
00131     O = 400, E = 30
00132  * axtScoreSchemeFree this when done. */
00133 
00134 struct axtScoreScheme *axtScoreSchemeReadLf(struct lineFile *lf );
00135 /* Read in scoring scheme from file. Looks like
00136     A    C    G    T
00137     91 -114  -31 -123
00138     -114  100 -125  -31
00139     -31 -125  100 -114
00140     -123  -31 -114   91
00141     O = 400, E = 30
00142  * axtScoreSchemeFree this when done. */
00143 
00144 void axtScoreSchemeDnaWrite(struct axtScoreScheme *ss, FILE *f, char *name);
00145 /* output the score dna based score matrix in meta Data format to File f,
00146 name should be set to the name of the program that is using the matrix */
00147 
00148 int axtScoreSym(struct axtScoreScheme *ss, int symCount, char *qSym, char *tSym);
00149 /* Return score without setting up an axt structure. */
00150 
00151 int axtScore(struct axt *axt, struct axtScoreScheme *ss);
00152 /* Return calculated score of axt. */
00153 
00154 int axtScoreFilterRepeats(struct axt *axt, struct axtScoreScheme *ss);
00155 /* Return calculated score of axt. do not score gaps if they are repeat masked*/
00156 
00157 int axtScoreUngapped(struct axtScoreScheme *ss, char *q, char *t, int size);
00158 /* Score ungapped alignment. */
00159 
00160 int axtScoreDnaDefault(struct axt *axt);
00161 /* Score DNA-based axt using default scheme. */
00162 
00163 int axtScoreProteinDefault(struct axt *axt);
00164 /* Score protein-based axt using default scheme. */
00165 
00166 void axtSubsetOnT(struct axt *axt, int newStart, int newEnd, 
00167         struct axtScoreScheme *ss, FILE *f);
00168 /* Write out subset of axt that goes from newStart to newEnd
00169  * in target coordinates. */
00170 
00171 int axtTransPosToQ(struct axt *axt, int tPos);
00172 /* Convert from t to q coordinates */
00173 
00174 void axtSwap(struct axt *axt, int tSize, int qSize);
00175 /* Flip target and query on one axt. */
00176 
00177 struct axtBundle
00178 /* A bunch of axt's on the same query/target sequence. */
00179     {
00180     struct axtBundle *next;
00181     struct axt *axtList;        /* List of alignments. */
00182     int tSize;                  /* Size of target. */
00183     int qSize;                  /* Size of query. */
00184     };
00185 
00186 void axtBundleFree(struct axtBundle **pObj);
00187 /* Free a axtBundle. */
00188 
00189 void axtBundleFreeList(struct axtBundle **pList);
00190 /* Free a list of gfAxtBundles. */
00191 
00192 void axtBlastOut(struct axtBundle *abList, 
00193         int queryIx, boolean isProt, FILE *f, 
00194         char *databaseName, int databaseSeqCount, double databaseLetterCount, 
00195         char *blastType, char *ourId, double minIdentity);
00196 /* Output a bundle of axt's on the same query sequence in blast format.
00197  * The parameters in detail are:
00198  *   ab - the list of bundles of axt's. 
00199  *   f  - output file handle
00200  *   databaseSeqCount - number of sequences in database
00201  *   databaseLetterCount - number of bases or aa's in database
00202  *   blastType - blast/wublast/blast8/blast9/xml
00203  *   ourId - optional (may be NULL) thing to put in header
00204  */
00205 
00206 struct axt *axtAffine(bioSeq *query, bioSeq *target, struct axtScoreScheme *ss);
00207 /* Return alignment if any of query and target using scoring scheme.  This does
00208  * dynamic program affine-gap based alignment.  It's not suitable for very large
00209  * sequences. */
00210 
00211 boolean axtAffineSmallEnough(double querySize, double targetSize);
00212 /* Return TRUE if it is reasonable to align sequences of given sizes
00213  * with axtAffine. */
00214 
00215 
00216 struct axt *axtAffine2Level(bioSeq *query, bioSeq *target, struct axtScoreScheme *ss);
00217 /* 
00218 
00219    Return alignment if any of query and target using scoring scheme. 
00220    
00221    2Level uses an economical amount of ram and should work for large target sequences.
00222    
00223    If Q is query size and T is target size and M is memory size, then
00224    Total memory used M = 30*Q*sqrt(T).  When the target is much larger than the query
00225    this method saves ram, and average runtime is only 50% greater, or 1.5 QT.  
00226    If Q=5000 and T=245,522,847 for hg17 chr1, then M = 2.2 GB ram.  
00227    axtAffine would need M=3QT = 3.4 TB.
00228    Of course massive alignments will be painfully slow anyway.
00229 
00230    Works for protein as well as DNA given the correct scoreScheme.
00231   
00232    NOTES:
00233    Double-gap cost is equal to gap-extend cost, but gap-open would also work.
00234    On very large target, score integer may overflow.
00235    Input sequences not checked for invalid chars.
00236    Input not checked but query should be shorter than target.
00237    
00238 */
00239 
00240 void axtAddBlocksToBoxInList(struct cBlock **pList, struct axt *axt);
00241 /* Add blocks (gapless subalignments) from (non-NULL!) axt to block list. 
00242  * Note: list will be in reverse order of axt blocks. */
00243 
00244 void axtPrintTraditional(struct axt *axt, int maxLine, struct axtScoreScheme *ss, 
00245         FILE *f);
00246 /* Print out an alignment with line-breaks. */
00247 
00248 double axtIdWithGaps(struct axt *axt);
00249 /* Return ratio of matching bases to total symbols in alignment. */
00250 
00251 double axtCoverage(struct axt *axt, int qSize, int tSize);
00252 /* Return % of q and t covered. */
00253 
00254 #endif /* AXT_H */
00255 

Generated on Tue Dec 25 18:39:29 2007 for blat by  doxygen 1.5.2