inc/psl.h

Go to the documentation of this file.
00001 /* psl.h was originally generated by the autoSql program, which also 
00002  * generated psl.c and psl.sql.  This header links the database and 
00003  * the RAM representation of objects.   Additional functions were
00004  * added later. 
00005  *
00006  * This file is copyright 2002 Jim Kent, but license is hereby
00007  * granted for all use - public, private or commercial. */
00008 
00009 #ifndef PSL_H
00010 #define PSL_H
00011 
00012 #ifndef LOCALMEM_H
00013 #include "localmem.h"
00014 #endif 
00015 
00016 #ifndef LINEFILE_H
00017 #include "linefile.h"
00018 #endif
00019 
00020 #ifndef FUZZYFIND_H
00021 #include "fuzzyFind.h"
00022 #endif
00023 
00024 #ifndef DNASEQ_H
00025 #include "dnaseq.h"
00026 #endif
00027 
00028 /* Some forward declarations of structures used but not defined here. */
00029 struct rbTree;
00030 
00031 #define PSL_NUM_COLS  21  /* number of columns in a PSL */
00032 #define PSLX_NUM_COLS 23  /* number of columns in a PSLX */
00033 
00034 /* Options to pslGetCreateSql */
00035 #define PSL_TNAMEIX   0x01  /* create target name index */
00036 #define PSL_WITH_BIN  0x02  /* add bin column */
00037 #define PSL_XA_FORMAT 0x04  /* add XA format columns */
00038 
00039 /* options for pslFromAlign */
00040 #define PSL_IS_SOFTMASK 0x01 /* lower case are mask */
00041 
00042 struct psl
00043 /* Summary info about a patSpace alignment: match counts, query/target coordinates and per-block layout. */
00044     {
00045     struct psl *next;  /* Next in singly linked list. */
00046     unsigned match;     /* Number of bases that match and are not repeats */
00047     unsigned misMatch;  /* Number of bases that do not match */
00048     unsigned repMatch;  /* Number of bases that match but are part of repeats */
00049     unsigned nCount;    /* Number of N bases */
00050     unsigned qNumInsert;        /* Number of inserts in query */
00051     int qBaseInsert;    /* Number of bases inserted in query */
00052     unsigned tNumInsert;        /* Number of inserts in target */
00053     int tBaseInsert;    /* Number of bases inserted in target */
00054     char strand[3];     /* Query strand (+ or -) in strand[0], optional target strand in strand[1]; see pslQStrand/pslTStrand */
00055     char *qName;        /* Query sequence name */
00056     unsigned qSize;     /* Query sequence size */
00057     int qStart; /* Alignment start position in query */
00058     int qEnd;   /* Alignment end position in query */
00059     char *tName;        /* Target sequence name */
00060     unsigned tSize;     /* Target sequence size */
00061     int tStart; /* Alignment start position in target */
00062     int tEnd;   /* Alignment end position in target */
00063     unsigned blockCount;        /* Number of blocks in alignment */
00064     unsigned *blockSizes;       /* Size of each block, one entry per block */
00065     unsigned *qStarts;  /* Start of each block in query, one entry per block. */
00066     unsigned *tStarts;  /* Start of each block in target, one entry per block. */
00067 
00068     char **qSequence;  /* Query sequence for each block (NOTE(review): presumably only set for pslx records - confirm) */
00069     char **tSequence;  /* Target sequence for each block (NOTE(review): presumably only set for pslx records - confirm) */
00070     };
00071 
00072 struct psl *pslxLoad(char **row);
00073 /* Load a pslx from row fetched with select * from psl
00074  * from database.  Dispose of this with pslFree(). */
00075 
00076 struct psl *pslLoad(char **row);
00077 /* Load a psl from row fetched with select * from psl
00078  * from database.  Dispose of this with pslFree(). */
00079 
00080 struct psl *pslCommaIn(char **pS, struct psl *ret);
00081 /* Create a psl out of a comma separated string. 
00082  * This will fill in ret if non-null, otherwise will
00083  * return a new psl */
00084 
00085 void pslFree(struct psl **pEl);
00086 /* Free a single dynamically allocated psl such as created
00087  * with pslLoad(). */
00088 
00089 void pslFreeList(struct psl **pList);
00090 /* Free a list of dynamically allocated psl's */
00091 
00092 void pslOutput(struct psl *el, FILE *f, char sep, char lastSep);
00093 /* Print out psl.  Separate fields with sep. Follow last field with lastSep. */
00094 
00095 #define pslTabOut(el,f) pslOutput(el,f,'\t','\n')
00096 /* Print out psl as a line in a tab-separated file. */
00097 
00098 #define pslCommaOut(el,f) pslOutput(el,f,',',',')
00099 /* Print out psl as a comma separated list including final comma. */
00100 
00101 /* ----- end autoSql generated part --------------- */
00102 
00103 void pslOutFormat(struct psl *el, FILE *f, char sep, char lastSep);
00104 /* Print out selected psl values.  Separate fields with sep. Follow last field with lastSep. */
00105 /* Prints out a better format with bold field headings followed by value */
00106 /* Requires further upstream work to ensure that only the field headers */
00107 /* declared here are printed if replacing an existing psl print function*/
00108 
00109 struct psl *pslLoadAll(char *fileName);
00110 /* Load all psl's in file. */
00111 
00112 struct psl *pslNext(struct lineFile *lf);
00113 /* Read next line from file and convert it to psl.  Return
00114  * NULL at eof. */
00115 
00116 struct psl *pslxLoadLm(char **row, struct lm *lm);
00117 /* Load row into local memory pslx. */
00118 
00119 struct psl *pslLoadLm(char **row, struct lm *lm);
00120 /* Load row into local memory psl. */
00121 
00122 void pslWriteHead(FILE *f);
00123 /* Write head of psl. */
00124 
00125 void pslxWriteHead(FILE *f, enum gfType qType, enum gfType tType);
00126 /* Write head of pslx (extended psl). */
00127 
00128 void pslWriteAll(struct psl *pslList, char *fileName, boolean writeHeader);
00129 /* Write a psl file from list. */
00130 
00131 struct lineFile *pslFileOpen(char *fileName);
00132 /* Read header part of psl and make sure it's right. 
00133  * Return line file handle to it. */
00134 
00135 struct lineFile *pslFileOpenWithMeta(char *fileName, FILE *f);
00136 /* Read header part of psl and make sure it's right. 
00137  * Return line file handle to it and send meta data to output file f */
00138 
00139 struct lineFile *pslFileOpenWithUniqueMeta(char *fileName, FILE *f);
00140 /* Read header part of psl and make sure it's right. 
00141 * Set flag to suppress duplicate header comments.
00142 * Return line file handle to it. */
00143 
00144 void pslxFileOpen(char *fileName, enum gfType *retQueryType, 
00145         enum gfType *retTargetType, struct lineFile **retLf);
00146 /* Read header part of psl and make sure it's right.  Return
00147  * sequence types and file handle. */
00148 
00149 void pslxFileOpenWithMeta(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f);
00150 /* Read header part of psl and make sure it's right.  Return
00151  * sequence types and file handle and send meta data to output file f */
00152 
00153 void pslxFileOpenWithUniqueMeta(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f);
00154 /* Read header part of psl and make sure it's right.  Return
00155 * sequence types and file handle and send only unique meta data to output f */
00156 
00157 int pslCmpQuery(const void *va, const void *vb);
00158 /* Compare to sort based on query. */
00159 
00160 int pslCmpTarget(const void *va, const void *vb);
00161 /* Compare to sort based on target. */
00162 
00163 int pslCmpTargetAndStrand(const void *va, const void *vb);
00164 /* Compare to sort based on target, strand,  tStart. */
00165 
00166 int pslCmpScore(const void *va, const void *vb);
00167 /* Compare to sort based on score (descending). */
00168 
00169 int pslCmpQueryScore(const void *va, const void *vb);
00170 /* Compare to sort based on query then score (descending). */
00171 
00172 int pslCalcMilliBad(struct psl *psl, boolean isMrna);
00173 /* Calculate badness in parts per thousand. */
00174 
00175 int pslCmpScoreDesc(const void *va, const void *vb);
00176 /* Compare to sort based on score descending. */
00177 
00178 int pslCmpMatch(const void *va, const void *vb);
00179 /* Compare to sort based on match. */
00180 
00181 int pslScore(const struct psl *psl);
00182 /* Return score for psl. */
00183 
00184 struct ffAli *pslToFfAli(struct psl *psl, struct dnaSeq *query, struct dnaSeq *target,
00185         int targetOffset);
00186 /* Convert from psl to ffAli format. */
00187 
00188 struct ffAli *pslToFakeFfAli(struct psl *psl, DNA *needle, DNA *haystack);
00189 /* Convert from psl to ffAli format.  In some cases you can pass NULL
00190  * for needle and haystack - depending what the post-processing is going
00191  * to be. */
00192 
00193 struct psl *pslFromFakeFfAli(struct ffAli *ff, 
00194         DNA *needle, DNA *haystack, char strand,
00195         char *qName, int qSize, char *tName, int tSize);
00196 /* This will create a basic psl structure from a sorted series of ffAli
00197  * blocks.  The fields that would need actual sequence to be filled in
00198  * are left zero, however - fields including match, repMatch, misMatch. */
00199 
00200 int pslOrientation(struct psl *psl);
00201 /* Translate psl strand + or - to orientation +1 or -1 */
00202 
00203 /* Macros to get query and target strand.  The target strand is reported
00204  * as an implied + unless strand[1] is explicitly -. */
00205 #define pslQStrand(p) ((p)->strand[0])
00206 #define pslTStrand(p) (((p)->strand[1] != '-') ? '+' : '-')
00207 
00208 int pslWeightedIntronOrientation(struct psl *psl, struct dnaSeq *genoSeq, int offset);
00209 /* Return >0 if introns make it look like alignment is on + strand,
00210  *        <0 if introns make it look like alignment is on - strand,
00211  *        0 if can't tell.  The absolute value of the return indicates
00212  * how many splice sites we've seen supporting the orientation.
00213  * Sequence should NOT be reverse complemented.  */
00214 
00215 int pslIntronOrientation(struct psl *psl, struct dnaSeq *genoSeq, int offset);
00216 /* Return 1 if introns make it look like alignment is on + strand,
00217  *       -1 if introns make it look like alignment is on - strand,
00218  *        0 if can't tell.
00219  * Sequence should NOT be reverse complemented.  */
00220 
00221 boolean pslHasIntron(struct psl *psl, struct dnaSeq *seq, int seqOffset);
00222 /* Return TRUE if there's a probable intron. Sequence should NOT be
00223  * reverse complemented. */
00224 
00225 void pslTailSizes(struct psl *psl, int *retStartTail, int *retEndTail);
00226 /* Find the length of "tails" (rather than extensions) implied by psl. */
00227 
00228 void pslRcBoth(struct psl *psl);
00229 /* Swap around things in psl so it works as if the alignment
00230  * was done on the reverse strand of the target. */
00231 
00232 void pslRc(struct psl *psl);
00233 /* reverse-complement a PSL alignment.  This makes target strand explicit. */
00234 
00235 void pslSwap(struct psl *psl, boolean noRc);
00236 /* Swap query and target in psl.  If noRc is TRUE, don't reverse-complement
00237  * PSL if needed, instead make target strand explicit. */
00238 
00239 void pslTargetOffset(struct psl *psl, int offset);
00240 /* Add offset to target positions in psl. */
00241 
00242 void pslDump(struct psl *psl, FILE *f);
00243 /* Dump most of PSL to file - for debugging. */
00244 
00245 struct psl *pslTrimToTargetRange(struct psl *oldPsl, int tMin, int tMax);
00246 /* Return psl trimmed to fit inside tMin/tMax.  Note this does not
00247  * update the match/misMatch and related fields. */
00248 
00249 struct psl *pslTrimToQueryRange(struct psl *oldPsl, int qMin, int qMax);
00250 /* Return psl trimmed to fit inside qMin/qMax.  Note this does not
00251  * update the match/misMatch and related fields. */
00252 
00253 char* pslGetCreateSql(char* table, unsigned options, int tNameIdxLen);
00254 /* Get SQL required to create PSL table.  Options is a bit set consisting
00255  * of PSL_TNAMEIX, PSL_WITH_BIN, and PSL_XA_FORMAT.  tNameIdxLen is
00256  * the number of characters in target name to index.  If greater than
00257  * zero, must specify PSL_TNAMEIX.  If zero and PSL_TNAMEIX is specified,
00258  * it will default to 8. */
00259 
00260 int pslCheck(char *pslDesc, FILE* out, struct psl* psl);
00261 /* Validate a PSL for consistency.  pslDesc is printed with the error messages
00262  * written to file out (open /dev/null to discard). Return count of errors. */
00263 
00264 int pslCountBlocks(struct psl *target, struct psl *query, int maxBlockGap);
00265 /* count the number of blocks in the query that overlap the target */
00266 /* merge blocks that are closer than maxBlockGap */
00267 
00268 struct hash *readPslToBinKeeper(char *sizeFileName, char *pslFileName);
00269 /* Read a list of psls and return results in a hash of binKeeper structures for fast query. */
00270 
00271 boolean pslIsProtein(const struct psl *psl);
00272 /* Is psl a protein psl (are its blockSizes and scores in protein space)? */
00273 
00274 struct psl* pslFromAlign(char *qName, int qSize, int qStart, int qEnd, char *qString,
00275                          char *tName, int tSize, int tStart, int tEnd, char *tString,
00276                          char* strand, unsigned options);
00277 /* Create a PSL from an alignment.  Specify option PSL_IS_SOFTMASK if lower case
00278  * bases indicate repeat masking.  Returns NULL if alignment is empty after
00279  * trimming leading and trailing indels. */
00280 
00281 int pslShowAlignment(struct psl *psl, boolean isProt,
00282         char *qName, bioSeq *qSeq, int qStart, int qEnd,
00283         char *tName, bioSeq *tSeq, int tStart, int tEnd, FILE *f);
00284 /* Show protein/DNA alignment or translated DNA alignment in HTML format. */
00285 
00286 int pslGenoShowAlignment(struct psl *psl, boolean isProt,
00287                       char *qName, bioSeq *qSeq, int qStart, int qEnd,
00288                       char *tName, bioSeq *tSeq, int tStart, int tEnd, int exnStarts[], int exnEnds[], int exnCnt, FILE *f);
00289 /* Show protein/DNA alignment or translated DNA alignment in HTML format. */
00290 
00291 struct psl* pslNew(char *qName, unsigned qSize, int qStart, int qEnd,
00292                    char *tName, unsigned tSize, int tStart, int tEnd,
00293                    char *strand, unsigned blockSpace, unsigned opts);
00294 /* Create a new psl with space for the specified number of blocks allocated.
00295  * pslGrow may be used to expand this space if needed.  Valid options are
00296  * PSL_XA_FORMAT. */
00297 
00298 void pslGrow(struct psl *psl, int *blockSpacePtr);
00299 /* Increase memory allocated to a psl to hold more blocks.  blockSpacePtr
00300  * should point to the current maximum number of blocks and will be
00301  * updated with the new amount of space. */
00302 
00303 int pslRangeTreeOverlap(struct psl *psl, struct rbTree *rangeTree);
00304 /* Return amount that psl overlaps (on target side) with rangeTree. */
00305 
00306 #endif /* PSL_H */
00307 

Generated on Tue Dec 25 18:39:29 2007 for blat by  doxygen 1.5.2