00001 /* AXT - A simple alignment format with four lines per 00002 * alignment. The first specifies the names of the 00003 * two sequences that align and the position of the 00004 * alignment, as well as the strand. The next two 00005 * lines contain the alignment itself including dashes 00006 * for inserts. The alignment is separated from the 00007 * next alignment with a blank line. 00008 * 00009 * This file contains routines to read such alignments. 00010 * Note that though the coordinates are one based and 00011 * closed on disk, they get converted to our usual half 00012 * open zero based in memory. 00013 * 00014 * This also contains code for simple DNA alignment scoring 00015 * schemes. 00016 * 00017 * This file is copyright 2002 Jim Kent, but license is hereby 00018 * granted for all use - public, private or commercial. */ 00019 00020 #ifndef AXT_H 00021 #define AXT_H 00022 00023 #ifndef LINEFILE_H 00024 #include "linefile.h" 00025 #endif 00026 00027 #ifndef DNASEQ_H 00028 #include "dnaseq.h" 00029 #endif 00030 00031 #ifndef CHAIN_H 00032 #include "chain.h" 00033 #endif 00034 00035 struct axt 00036 /* This contains information about one xeno alignment. */ 00037 { 00038 struct axt *next; 00039 char *qName; /* Name of query sequence. */ 00040 int qStart, qEnd; /* Half open zero=based coordinates. */ 00041 char qStrand; /* Is this forward or reverse strand + or - */ 00042 char *tName; /* Name of target. */ 00043 int tStart, tEnd; /* Target coordinates. */ 00044 char tStrand; /* Target strand - currently always +. */ 00045 int score; /* Score. Zero for unknown. Units arbitrary. */ 00046 int symCount; /* Size of alignments. */ 00047 char *qSym, *tSym; /* Alignments. */ 00048 int frame; /* If non-zero then translation frame. */ 00049 }; 00050 00051 void axtFree(struct axt **pEl); 00052 /* Free an axt. */ 00053 00054 void axtFreeList(struct axt **pList); 00055 /* Free a list of dynamically allocated axt's */ 00056 00057 struct axt *axtRead(struct lineFile *lf); 00058 /* Read in next record from .axt file and return it. 00059 * Returns NULL at EOF. */ 00060 00061 struct axt *axtReadWithPos(struct lineFile *lf, off_t *retOffset); 00062 /* Read next axt, and if retOffset is not-NULL, fill it with 00063 * offset of start of axt. */ 00064 00065 boolean axtCheck(struct axt *axt, struct lineFile *lf); 00066 /* Return FALSE if there's a problem with axt. */ 00067 00068 void axtWrite(struct axt *axt, FILE *f); 00069 /* Output axt to axt file. */ 00070 00071 int axtCmpQuery(const void *va, const void *vb); 00072 /* Compare to sort based on query position. */ 00073 00074 int axtCmpTarget(const void *va, const void *vb); 00075 /* Compare to sort based on target position. */ 00076 00077 int axtCmpScore(const void *va, const void *vb); 00078 /* Compare to sort based on score. */ 00079 00080 int axtCmpTargetScoreDesc(const void *va, const void *vb); 00081 /* Compare to sort based on target name and score descending. */ 00082 00083 struct axtScoreScheme 00084 /* A scoring scheme or DNA alignment. */ 00085 { 00086 struct scoreMatrix *next; 00087 int matrix[256][256]; /* Look up with letters. */ 00088 int gapOpen; /* Gap open cost. */ 00089 int gapExtend; /* Gap extension. */ 00090 char *extra; /* extra parameters */ 00091 }; 00092 00093 void axtScoreSchemeFree(struct axtScoreScheme **pObj); 00094 /* Free up score scheme. */ 00095 00096 struct axtScoreScheme *axtScoreSchemeDefault(); 00097 /* Return default scoring scheme (after blastz). Do NOT axtScoreSchemeFree 00098 * this. */ 00099 00100 struct axtScoreScheme *axtScoreSchemeSimpleDna(int match, int misMatch, int gapOpen, int gapExtend); 00101 /* Return a relatively simple scoring scheme for DNA. */ 00102 00103 struct axtScoreScheme *axtScoreSchemeRnaDefault(); 00104 /* Return default scoring scheme for RNA/DNA alignments 00105 * within the same species. Do NOT axtScoreSchemeFree 00106 * this. */ 00107 00108 struct axtScoreScheme *axtScoreSchemeFromBlastzMatrix(char *text, int gapOpen, int gapExtend); 00109 /* return scoring schema from a string in the following format */ 00110 /* 91,-90,-25,-100,-90,100,-100,-25,-25,-100,100,-90,-100,-25,-90,91 */ 00111 00112 struct axtScoreScheme *axtScoreSchemeRnaFill(); 00113 /* Return scoreing scheme a little more relaxed than 00114 * RNA/DNA defaults for filling in gaps. */ 00115 00116 struct axtScoreScheme *axtScoreSchemeProteinDefault(); 00117 /* Returns default protein scoring scheme. This is 00118 * scaled to be compatible with the blastz one. Don't 00119 * axtScoreSchemeFree this. */ 00120 00121 struct axtScoreScheme *axtScoreSchemeProteinRead(char *fileName); 00122 /* read in blosum-like matrix */ 00123 00124 struct axtScoreScheme *axtScoreSchemeRead(char *fileName); 00125 /* Read in scoring scheme from file. Looks like 00126 A C G T 00127 91 -114 -31 -123 00128 -114 100 -125 -31 00129 -31 -125 100 -114 00130 -123 -31 -114 91 00131 O = 400, E = 30 00132 * axtScoreSchemeFree this when done. */ 00133 00134 struct axtScoreScheme *axtScoreSchemeReadLf(struct lineFile *lf ); 00135 /* Read in scoring scheme from file. Looks like 00136 A C G T 00137 91 -114 -31 -123 00138 -114 100 -125 -31 00139 -31 -125 100 -114 00140 -123 -31 -114 91 00141 O = 400, E = 30 00142 * axtScoreSchemeFree this when done. */ 00143 00144 void axtScoreSchemeDnaWrite(struct axtScoreScheme *ss, FILE *f, char *name); 00145 /* output the score dna based score matrix in meta Data format to File f, 00146 name should be set to the name of the program that is using the matrix */ 00147 00148 int axtScoreSym(struct axtScoreScheme *ss, int symCount, char *qSym, char *tSym); 00149 /* Return score without setting up an axt structure. */ 00150 00151 int axtScore(struct axt *axt, struct axtScoreScheme *ss); 00152 /* Return calculated score of axt. */ 00153 00154 int axtScoreFilterRepeats(struct axt *axt, struct axtScoreScheme *ss); 00155 /* Return calculated score of axt. do not score gaps if they are repeat masked*/ 00156 00157 int axtScoreUngapped(struct axtScoreScheme *ss, char *q, char *t, int size); 00158 /* Score ungapped alignment. */ 00159 00160 int axtScoreDnaDefault(struct axt *axt); 00161 /* Score DNA-based axt using default scheme. */ 00162 00163 int axtScoreProteinDefault(struct axt *axt); 00164 /* Score protein-based axt using default scheme. */ 00165 00166 void axtSubsetOnT(struct axt *axt, int newStart, int newEnd, 00167 struct axtScoreScheme *ss, FILE *f); 00168 /* Write out subset of axt that goes from newStart to newEnd 00169 * in target coordinates. */ 00170 00171 int axtTransPosToQ(struct axt *axt, int tPos); 00172 /* Convert from t to q coordinates */ 00173 00174 void axtSwap(struct axt *axt, int tSize, int qSize); 00175 /* Flip target and query on one axt. */ 00176 00177 struct axtBundle 00178 /* A bunch of axt's on the same query/target sequence. */ 00179 { 00180 struct axtBundle *next; 00181 struct axt *axtList; /* List of alignments. */ 00182 int tSize; /* Size of target. */ 00183 int qSize; /* Size of query. */ 00184 }; 00185 00186 void axtBundleFree(struct axtBundle **pObj); 00187 /* Free a axtBundle. */ 00188 00189 void axtBundleFreeList(struct axtBundle **pList); 00190 /* Free a list of gfAxtBundles. */ 00191 00192 void axtBlastOut(struct axtBundle *abList, 00193 int queryIx, boolean isProt, FILE *f, 00194 char *databaseName, int databaseSeqCount, double databaseLetterCount, 00195 char *blastType, char *ourId, double minIdentity); 00196 /* Output a bundle of axt's on the same query sequence in blast format. 00197 * The parameters in detail are: 00198 * ab - the list of bundles of axt's. 00199 * f - output file handle 00200 * databaseSeqCount - number of sequences in database 00201 * databaseLetterCount - number of bases or aa's in database 00202 * blastType - blast/wublast/blast8/blast9/xml 00203 * ourId - optional (may be NULL) thing to put in header 00204 */ 00205 00206 struct axt *axtAffine(bioSeq *query, bioSeq *target, struct axtScoreScheme *ss); 00207 /* Return alignment if any of query and target using scoring scheme. This does 00208 * dynamic program affine-gap based alignment. It's not suitable for very large 00209 * sequences. */ 00210 00211 boolean axtAffineSmallEnough(double querySize, double targetSize); 00212 /* Return TRUE if it is reasonable to align sequences of given sizes 00213 * with axtAffine. */ 00214 00215 00216 struct axt *axtAffine2Level(bioSeq *query, bioSeq *target, struct axtScoreScheme *ss); 00217 /* 00218 00219 Return alignment if any of query and target using scoring scheme. 00220 00221 2Level uses an economical amount of ram and should work for large target sequences. 00222 00223 If Q is query size and T is target size and M is memory size, then 00224 Total memory used M = 30*Q*sqrt(T). When the target is much larger than the query 00225 this method saves ram, and average runtime is only 50% greater, or 1.5 QT. 00226 If Q=5000 and T=245,522,847 for hg17 chr1, then M = 2.2 GB ram. 00227 axtAffine would need M=3QT = 3.4 TB. 00228 Of course massive alignments will be painfully slow anyway. 00229 00230 Works for protein as well as DNA given the correct scoreScheme. 00231 00232 NOTES: 00233 Double-gap cost is equal to gap-extend cost, but gap-open would also work. 00234 On very large target, score integer may overflow. 00235 Input sequences not checked for invalid chars. 00236 Input not checked but query should be shorter than target. 00237 00238 */ 00239 00240 void axtAddBlocksToBoxInList(struct cBlock **pList, struct axt *axt); 00241 /* Add blocks (gapless subalignments) from (non-NULL!) axt to block list. 00242 * Note: list will be in reverse order of axt blocks. */ 00243 00244 void axtPrintTraditional(struct axt *axt, int maxLine, struct axtScoreScheme *ss, 00245 FILE *f); 00246 /* Print out an alignment with line-breaks. */ 00247 00248 double axtIdWithGaps(struct axt *axt); 00249 /* Return ratio of matching bases to total symbols in alignment. */ 00250 00251 double axtCoverage(struct axt *axt, int qSize, int tSize); 00252 /* Return % of q and t covered. */ 00253 00254 #endif /* AXT_H */ 00255
1.5.2