00001 /* psl.h was originally generated by the autoSql program, which also 00002 * generated psl.c and psl.sql. This header links the database and 00003 * the RAM representation of objects. Additional functions were 00004 * added later. 00005 * 00006 * This file is copyright 2002 Jim Kent, but license is hereby 00007 * granted for all use - public, private or commercial. */ 00008 00009 #ifndef PSL_H 00010 #define PSL_H 00011 00012 #ifndef LOCALMEM_H 00013 #include "localmem.h" 00014 #endif 00015 00016 #ifndef LINEFILE_H 00017 #include "linefile.h" 00018 #endif 00019 00020 #ifndef FUZZYFIND_H 00021 #include "fuzzyFind.h" 00022 #endif 00023 00024 #ifndef DNASEQ_H 00025 #include "dnaseq.h" 00026 #endif 00027 00028 /* Some forward declarations of structures used but not defined here. */ 00029 struct rbTree; 00030 00031 #define PSL_NUM_COLS 21 /* number of columns in a PSL */ 00032 #define PSLX_NUM_COLS 23 /* number of columns in a PSLX */ 00033 00034 /* Options to pslGetCreateSql */ 00035 #define PSL_TNAMEIX 0x01 /* create target name index */ 00036 #define PSL_WITH_BIN 0x02 /* add bin column */ 00037 #define PSL_XA_FORMAT 0x04 /* add XA format columns */ 00038 00039 /* options for pslFromAlign */ 00040 #define PSL_IS_SOFTMASK 0x01 /* lower case are mask */ 00041 00042 struct psl 00043 /* Summary info about a patSpace alignment */ 00044 { 00045 struct psl *next; /* Next in singly linked list. */ 00046 unsigned match; /* Number of bases that match that aren't repeats */ 00047 unsigned misMatch; /* Number of bases that don't match */ 00048 unsigned repMatch; /* Number of bases that match but are part of repeats */ 00049 unsigned nCount; /* Number of 'N' bases */ 00050 unsigned qNumInsert; /* Number of inserts in query */ 00051 int qBaseInsert; /* Number of bases inserted in query */ 00052 unsigned tNumInsert; /* Number of inserts in target */ 00053 int tBaseInsert; /* Number of bases inserted in target */ 00054 char strand[3]; /* + or - for strand */ 00055 char *qName; /* Query sequence name */ 00056 unsigned qSize; /* Query sequence size */ 00057 int qStart; /* Alignment start position in query */ 00058 int qEnd; /* Alignment end position in query */ 00059 char *tName; /* Target sequence name */ 00060 unsigned tSize; /* Target sequence size */ 00061 int tStart; /* Alignment start position in target */ 00062 int tEnd; /* Alignment end position in target */ 00063 unsigned blockCount; /* Number of blocks in alignment */ 00064 unsigned *blockSizes; /* Size of each block */ 00065 unsigned *qStarts; /* Start of each block in query. */ 00066 unsigned *tStarts; /* Start of each block in target. */ 00067 00068 char **qSequence; /* query sequence for each block */ 00069 char **tSequence; /* target sequence for each block */ 00070 }; 00071 00072 struct psl *pslxLoad(char **row); 00073 /* Load a pslx from row fetched with select * from psl 00074 * from database. Dispose of this with pslFree(). */ 00075 00076 struct psl *pslLoad(char **row); 00077 /* Load a psl from row fetched with select * from psl 00078 * from database. Dispose of this with pslFree(). */ 00079 00080 struct psl *pslCommaIn(char **pS, struct psl *ret); 00081 /* Create a psl out of a comma separated string. 00082 * This will fill in ret if non-null, otherwise will 00083 * return a new psl */ 00084 00085 void pslFree(struct psl **pEl); 00086 /* Free a single dynamically allocated psl such as created 00087 * with pslLoad(). */ 00088 00089 void pslFreeList(struct psl **pList); 00090 /* Free a list of dynamically allocated psl's */ 00091 00092 void pslOutput(struct psl *el, FILE *f, char sep, char lastSep); 00093 /* Print out psl. Separate fields with sep. Follow last field with lastSep. */ 00094 00095 #define pslTabOut(el,f) pslOutput(el,f,'\t','\n') 00096 /* Print out psl as a line in a tab-separated file. */ 00097 00098 #define pslCommaOut(el,f) pslOutput(el,f,',',',') 00099 /* Print out psl as a comma separated list including final comma. */ 00100 00101 /* ----- end autoSql generated part --------------- */ 00102 00103 void pslOutFormat(struct psl *el, FILE *f, char sep, char lastSep); 00104 /* Print out selected psl values. Separate fields with sep. Follow last field with lastSep. */ 00105 /* Prints out a better format with bold field headings followed by value */ 00106 /* Requires further upstream work to ensure that only the field headers */ 00107 /* declared here are printed if replacing an existing psl print function*/ 00108 00109 struct psl *pslLoadAll(char *fileName); 00110 /* Load all psl's in file. */ 00111 00112 struct psl *pslNext(struct lineFile *lf); 00113 /* Read next line from file and convert it to psl. Return 00114 * NULL at eof. */ 00115 00116 struct psl *pslxLoadLm(char **row, struct lm *lm); 00117 /* Load row into local memory pslx. */ 00118 00119 struct psl *pslLoadLm(char **row, struct lm *lm); 00120 /* Load row into local memory psl. */ 00121 00122 void pslWriteHead(FILE *f); 00123 /* Write head of psl. */ 00124 00125 void pslxWriteHead(FILE *f, enum gfType qType, enum gfType tType); 00126 /* Write head of pslx (extended psl). */ 00127 00128 void pslWriteAll(struct psl *pslList, char *fileName, boolean writeHeader); 00129 /* Write a psl file from list. */ 00130 00131 struct lineFile *pslFileOpen(char *fileName); 00132 /* Read header part of psl and make sure it's right. 00133 * Return line file handle to it. */ 00134 00135 struct lineFile *pslFileOpenWithMeta(char *fileName, FILE *f); 00136 /* Read header part of psl and make sure it's right. 00137 * Return line file handle to it and send meta data to output file f */ 00138 00139 struct lineFile *pslFileOpenWithUniqueMeta(char *fileName, FILE *f); 00140 /* Read header part of psl and make sure it's right. 00141 * Set flag to suppress duplicate header comments. 00142 * Return line file handle to it. */ 00143 00144 void pslxFileOpen(char *fileName, enum gfType *retQueryType, 00145 enum gfType *retTargetType, struct lineFile **retLf); 00146 /* Read header part of psl and make sure it's right. Return 00147 * sequence types and file handle. */ 00148 00149 void pslxFileOpenWithMeta(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f); 00150 /* Read header part of psl and make sure it's right. Return 00151 * sequence types and file handle and send meta data to output file f */ 00152 00153 void pslxFileOpenWithUniqueMeta(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f); 00154 /* Read header part of psl and make sure it's right. Return 00155 * sequence types and file handle and send only unique meta data to output f */ 00156 00157 int pslCmpQuery(const void *va, const void *vb); 00158 /* Compare to sort based on query. */ 00159 00160 int pslCmpTarget(const void *va, const void *vb); 00161 /* Compare to sort based on target. */ 00162 00163 int pslCmpTargetAndStrand(const void *va, const void *vb); 00164 /* Compare to sort based on target, strand, tStart. */ 00165 00166 int pslCmpScore(const void *va, const void *vb); 00167 /* Compare to sort based on score (descending). */ 00168 00169 int pslCmpQueryScore(const void *va, const void *vb); 00170 /* Compare to sort based on query then score (descending). */ 00171 00172 int pslCalcMilliBad(struct psl *psl, boolean isMrna); 00173 /* Calculate badness in parts per thousand. */ 00174 00175 int pslCmpScoreDesc(const void *va, const void *vb); 00176 /* Compare to sort based on score descending. */ 00177 00178 int pslCmpMatch(const void *va, const void *vb); 00179 /* Compare to sort based on match. */ 00180 00181 int pslScore(const struct psl *psl); 00182 /* Return score for psl. */ 00183 00184 struct ffAli *pslToFfAli(struct psl *psl, struct dnaSeq *query, struct dnaSeq *target, 00185 int targetOffset); 00186 /* Convert from psl to ffAli format. */ 00187 00188 struct ffAli *pslToFakeFfAli(struct psl *psl, DNA *needle, DNA *haystack); 00189 /* Convert from psl to ffAli format. In some cases you can pass NULL 00190 * for needle and haystack - depending what the post-processing is going 00191 * to be. */ 00192 00193 struct psl *pslFromFakeFfAli(struct ffAli *ff, 00194 DNA *needle, DNA *haystack, char strand, 00195 char *qName, int qSize, char *tName, int tSize); 00196 /* This will create a basic psl structure from a sorted series of ffAli 00197 * blocks. The fields that would need actual sequence to be filled in 00198 * are left zero however - fields including match, repMatch, mismatch. */ 00199 00200 int pslOrientation(struct psl *psl); 00201 /* Translate psl strand + or - to orientation +1 or -1 */ 00202 00203 /* marcos to get query and target strand. Target returns implied + when 00204 * it's not specific */ 00205 #define pslQStrand(p) ((p)->strand[0]) 00206 #define pslTStrand(p) (((p)->strand[1] != '-') ? '+' : '-') 00207 00208 int pslWeightedIntronOrientation(struct psl *psl, struct dnaSeq *genoSeq, int offset); 00209 /* Return >0 if introns make it look like alignment is on + strand, 00210 * <0 if introns make it look like alignment is on - strand, 00211 * 0 if can't tell. The absolute value of the return indicates 00212 * how many splice sites we've seen supporting the orientation. 00213 * Sequence should NOT be reverse complemented. */ 00214 00215 int pslIntronOrientation(struct psl *psl, struct dnaSeq *genoSeq, int offset); 00216 /* Return 1 if introns make it look like alignment is on + strand, 00217 * -1 if introns make it look like alignment is on - strand, 00218 * 0 if can't tell. 00219 * Sequence should NOT be reverse complemented. */ 00220 00221 boolean pslHasIntron(struct psl *psl, struct dnaSeq *seq, int seqOffset); 00222 /* Return TRUE if there's a probable intron. Sequence should NOT be 00223 * reverse complemented. */ 00224 00225 void pslTailSizes(struct psl *psl, int *retStartTail, int *retEndTail); 00226 /* Find the length of "tails" (rather than extensions) implied by psl. */ 00227 00228 void pslRcBoth(struct psl *psl); 00229 /* Swap around things in psl so it works as if the alignment 00230 * was done on the reverse strand of the target. */ 00231 00232 void pslRc(struct psl *psl); 00233 /* reverse-complement a PSL alignment. This makes target strand explicit. */ 00234 00235 void pslSwap(struct psl *psl, boolean noRc); 00236 /* swap query and target in psl. If noRc is TRUE, don't reverse-complement 00237 * PSL if needed, instead make target strand explict. */ 00238 00239 void pslTargetOffset(struct psl *psl, int offset); 00240 /* Add offset to target positions in psl. */ 00241 00242 void pslDump(struct psl *psl, FILE *f); 00243 /* Dump most of PSL to file - for debugging. */ 00244 00245 struct psl *pslTrimToTargetRange(struct psl *oldPsl, int tMin, int tMax); 00246 /* Return psl trimmed to fit inside tMin/tMax. Note this does not 00247 * update the match/misMatch and related fields. */ 00248 00249 struct psl *pslTrimToQueryRange(struct psl *oldPsl, int qMin, int qMax); 00250 /* Return psl trimmed to fit inside qMin/qMax. Note this does not 00251 * update the match/misMatch and related fields. */ 00252 00253 char* pslGetCreateSql(char* table, unsigned options, int tNameIdxLen); 00254 /* Get SQL required to create PSL table. Options is a bit set consisting 00255 * of PSL_TNAMEIX, PSL_WITH_BIN, and PSL_XA_FORMAT. tNameIdxLen is 00256 * the number of characters in target name to index. If greater than 00257 * zero, must specify PSL_TNAMEIX. If zero and PSL_TNAMEIX is specified, 00258 * to will default to 8. */ 00259 00260 int pslCheck(char *pslDesc, FILE* out, struct psl* psl); 00261 /* Validate a PSL for consistency. pslDesc is printed the error messages 00262 * to file out (open /dev/null to discard). Return count of errors. */ 00263 00264 int pslCountBlocks(struct psl *target, struct psl *query, int maxBlockGap); 00265 /* count the number of blocks in the query that overlap the target */ 00266 /* merge blocks that are closer than maxBlockGap */ 00267 00268 struct hash *readPslToBinKeeper(char *sizeFileName, char *pslFileName); 00269 /* read a list of psls and return results in hash of binKeeper structure for fast query*/ 00270 00271 boolean pslIsProtein(const struct psl *psl); 00272 /* is psl a protein psl (are it's blockSizes and scores in protein space) */ 00273 00274 struct psl* pslFromAlign(char *qName, int qSize, int qStart, int qEnd, char *qString, 00275 char *tName, int tSize, int tStart, int tEnd, char *tString, 00276 char* strand, unsigned options); 00277 /* Create a PSL from an alignment. Options PSL_IS_SOFTMASK if lower case 00278 * bases indicate repeat masking. Returns NULL if alignment is empty after 00279 * triming leading and trailing indels.*/ 00280 00281 int pslShowAlignment(struct psl *psl, boolean isProt, 00282 char *qName, bioSeq *qSeq, int qStart, int qEnd, 00283 char *tName, bioSeq *tSeq, int tStart, int tEnd, FILE *f); 00284 /* Show protein/DNA alignment or translated DNA alignment in HTML format. */ 00285 00286 int pslGenoShowAlignment(struct psl *psl, boolean isProt, 00287 char *qName, bioSeq *qSeq, int qStart, int qEnd, 00288 char *tName, bioSeq *tSeq, int tStart, int tEnd, int exnStarts[], int exnEnds[], int exnCnt, FILE *f); 00289 /* Show protein/DNA alignment or translated DNA alignment in HTML format. */ 00290 00291 struct psl* pslNew(char *qName, unsigned qSize, int qStart, int qEnd, 00292 char *tName, unsigned tSize, int tStart, int tEnd, 00293 char *strand, unsigned blockSpace, unsigned opts); 00294 /* create a new psl with space for the specified number of blocks allocated. 00295 * pslGrow maybe used to expand this space if needed. Valid options are 00296 * PSL_XA_FORMAT. */ 00297 00298 void pslGrow(struct psl *psl, int *blockSpacePtr); 00299 /* Increase memory allocated to a psl to hold more blocks. blockSpacePtr 00300 * should point the the current maximum number of blocks and will be 00301 * updated to with the new amount of space. */ 00302 00303 int pslRangeTreeOverlap(struct psl *psl, struct rbTree *rangeTree); 00304 /* Return amount that psl overlaps (on target side) with rangeTree. */ 00305 00306 #endif /* PSL_H */ 00307
1.5.2