inc/genoFind.h

Go to the documentation of this file.
00001 /* genoFind.h - Interface to modules for fast finding of sequence
00002  * matches. */
00003 /* Copyright 2001-2002 Jim Kent.  All rights reserved. */
00004 
00005 #ifndef GENOFIND_H
00006 #define GENOFIND_H
00007 
00008 #ifndef DNASEQ_H
00009 #include "dnaseq.h"
00010 #endif
00011 
00012 #ifndef FUZZYFIND_H
00013 #include "fuzzyFind.h"
00014 #endif
00015 
00016 #ifndef HASH_H
00017 #include "hash.h"
00018 #endif
00019 
00020 #ifndef ALITYPE_H
00021 #include "aliType.h"
00022 #endif
00023 
00024 #ifndef LOCALMEM_H
00025 #include "localmem.h"
00026 #endif 
00027 
00028 #ifndef BITS_H
00029 #include "bits.h"
00030 #endif
00031 
00032 #ifndef AXT_H
00033 #include "axt.h"
00034 #endif
00035 
00036 enum gfConstants {
00037     gfMinMatch = 2,
00038     gfMaxGap = 2,
00039     gfTileSize = 11,
00040     gfMaxTileUse = 1024,
00041     gfPepMaxTileUse = 30000,
00042 };
00043 
00044 struct gfSeqSource
00045 /* Where a block of sequence comes from. */
00046     {
00047     struct gfSeqSource *next;
00048     char *fileName;     /* Name of file. */
00049     bioSeq *seq;        /* Sequences.  Usually either this or fileName is NULL. */
00050     bits32 start,end;   /* Position within merged sequence. */
00051     Bits *maskedBits;   /* If non-null contains repeat-masking info. */
00052     };
00053 
00054 struct gfHit
00055 /* A genoFind hit. */
00056    {
00057    struct gfHit *next;
00058    bits32 qStart;               /* Where it hits in query. */
00059    bits32 tStart;               /* Where it hits in target. */
00060    bits32 diagonal;             /* tStart + qSize - qStart. */
00061    };
00062 
00063 /* gfHits are free'd with simple freeMem or slFreeList. */
00064 
00065 struct gfClump
00066 /* A clump of hits. */
00067 /* Note: for clumps from regular (blat) queries, tStart and tEnd include 
00068  * target->start, but for clumps from gfPcrClumps(), tStart and tEnd have 
00069  * already had target->start subtracted.  So tStart and tEnd in PCR clumps 
00070  * are relative to that target sequence (not the collection of all target 
00071  * sequences). */
00072     {
00073     struct gfClump *next;       /* Next clump. */
00074     bits32 qStart, qEnd;        /* Position in query. */
00075     struct gfSeqSource *target; /* Target source sequence. */
00076     bits32 tStart, tEnd;        /* Position in target. */
00077     int hitCount;               /* Number of hits. */
00078     struct gfHit *hitList;      /* List of hits. Not allocated here. */
00079     int queryCoverage;          /* Number of bases covered in query (thx AG!) */
00080     };
00081 
00082 void gfClumpFree(struct gfClump **pClump);
00083 /* Free a single clump. */
00084 
00085 void gfClumpFreeList(struct gfClump **pList);
00086 /* Free a list of dynamically allocated gfClump's */
00087 
00088 struct genoFind
00089 /* An index of all K-mers in the genome. */
00090     {
00091     int maxPat;                          /* Max # of times pattern can occur
00092                                           * before it is ignored. */
00093     int minMatch;                        /* Minimum number of tile hits needed
00094                                           * to trigger a clump hit. */
00095     int maxGap;                          /* Max gap between tiles in a clump. */
00096     int tileSize;                        /* Size of each N-mer. */
00097     int stepSize;                        /* Spacing between N-mers. */
00098     int tileSpaceSize;                   /* Number of N-mer values. */
00099     int tileMask;                        /* 1-s for each N-mer. */
00100     int sourceCount;                     /* Count of source files. */
00101     struct gfSeqSource *sources;         /* List of sequence sources. */
00102     bool isPep;                          /* Is a peptide. */
00103     bool allowOneMismatch;               /* Allow a single mismatch? */
00104     int segSize;                         /* Index is segmented if non-zero. */
00105     bits32 totalSeqSize;                 /* Total size of all sequences. */
00106     bits32 *listSizes;                   /* Size of list for each N-mer */
00107     void *allocated;                     /* Storage space for all lists. */
00108     bits32 **lists;                      /* A list for each N-mer. Used if
00109                                           * isSegmented is false. */
00110     bits16 **endLists;                   /* A more complex list for each N-mer.
00111                                           * Used if isSegmented is true.
00112                                           * Values come in groups of threes.
00113                                           * The first is the packed last few
00114                                           * letters of the tile.  The next two
00115                                           * are the offset in the genome.  This
00116                                           * would be a struct but that would take
00117                                           * 8 bytes instead of 6, or nearly an
00118                                           * extra gigabyte of RAM. */
00119     };
00120 
00121 void genoFindFree(struct genoFind **pGenoFind);
00122 /* Free up a genoFind index. */
00123 
00124 struct gfSeqSource *gfFindNamedSource(struct genoFind *gf, char *name);
00125 /* Find target of given name.  Return NULL if none. */
00126 
00127 /* ---  Stuff for saving results ---- */
00128 
00129 
00130 struct gfOutput
00131 /* A polymorphic object to help us write many file types. */
00132     {
00133     struct gfOutput *next;
00134     void *data;         /* Type-specific data pointer.  Must be freeMem'able */
00135     void (*out)(char *chromName, int chromSize, int chromOffset,
00136             struct ffAli *ali, bioSeq *tSeq, struct hash *t3Hash, bioSeq *qSeq, 
00137             boolean qIsRc, boolean tIsRc,
00138             enum ffStringency stringency, int minMatch, struct gfOutput *out);
00139     /* This is the type of a client provided function to save an alignment. 
00140      * The parameters are:
00141      *     chromName - name of target (aka genomic or database) sequence.
00142      *     chromSize - size of target sequence.
00143      *     chromOffset - offset of genoSequence in target.
00144      *     ffAli - alignment with pointers into tSeq/qSeq or in
00145      *             translated target case, into t3Hash.
00146      *     tSeq - part of target sequence in normal case.   In translated
00147      *             target case look at t3Hash instead.
00148      *     t3Hash - used only in translated target case.  A hash keyed by
00149      *             target sequence name with values *lists* of trans3 structures.
00150      *             This hash can be searched to find both the translated and
00151      *             untranslated versions of the bits of the target that are in 
00152      *             memory.  (You can assume at this point all parts needed for
00153      *             output are indeed in memory.)
00154      *     qSeq - query sequence (this isn't segmented at all). 
00155      *     isRc - True if query is reverse complemented.
00156      *     stringency - ffCdna, etc.  I'm hoping to move this elsewhere.
00157      *     minMatch - minimum score to output.  Also should be moved elsewhere.
00158      *     outputData - custom data for specific output function.
00159      * The interface is a bit complex - partly from the demands of translated
00160      * output, and partly from trying not to have the entire target sequence in
00161      * memory.
00162      */
00163     void (*queryOut)(struct gfOutput *out, FILE *f); 
00164     /* Called for each query */
00165 
00166     void (*fileHead)(struct gfOutput *out, FILE *f);
00167     /* Write file header if any */
00168 
00169     boolean reportTargetStrand; /* Report target as well as query strand? */
00170     struct hash *maskHash;      /* associates target sequence name and mask. */
00171     int minGood;                /* Minimum sequence identity in thousandths. */
00172     boolean qIsProt;            /* Query is peptide. */
00173     boolean tIsProt;            /* Target is peptide. */
00174     int queryIx;                /* Index of query */
00175     boolean includeTargetFile;  /* Prefix file: to target sequence name. */
00176     };
00177 
00178 struct gfOutput *gfOutputAny(char *format, 
00179         int goodPpt, boolean qIsProt, boolean tIsProt, 
00180         boolean noHead, char *databaseName,
00181         int databaseSeqCount, double databaseLetters,
00182         double minIdentity, FILE *f);
00183 /* Initialize output in a variety of formats in file or memory. 
00184  * Parameters:
00185  *    format - either 'psl', 'pslx', 'blast', 'wublast', 'axt'
00186  *    goodPpt - minimum identity of alignments to output in parts per thousand
00187  *    qIsProt - true if query side is a protein.
00188  *    tIsProt - true if target (database) side is a protein.
00189  *    noHead - if true suppress header in psl/pslx output.
00190  *    databaseName - name of database.  Only used for blast output
00191  *    databaseSeq - number of sequences in database - only for blast
00192  *    databaseLetters - number of bases/aas in database - only blast
00193  *    FILE *f - file.  
00194  */
00195 
00196 struct gfOutput *gfOutputPsl(int goodPpt, 
00197         boolean qIsProt, boolean tIsProt, FILE *f, 
00198         boolean saveSeq, boolean noHead);
00199 /* Set up psl/pslx output */
00200 
00201 struct gfOutput *gfOutputAxt(int goodPpt, boolean qIsProt, 
00202         boolean tIsProt, FILE *f);
00203 /* Setup output for axt format. */
00204 
00205 struct gfOutput *gfOutputAxtMem(int goodPpt, boolean qIsProt, 
00206         boolean tIsProt);
00207 /* Setup output for in memory axt output. */
00208 
00209 struct gfOutput *gfOutputBlast(int goodPpt, 
00210         boolean qIsProt, boolean tIsProt, 
00211         char *databaseName, int databaseSeqCount, double databaseLetters,
00212         char *blastType, /* blast, blast8, blast9, wublast, or xml */
00213         double minIdentity, FILE *f);
00214 /* Setup output for blast/wublast format. */
00215 
00216 void gfOutputQuery(struct gfOutput *out, FILE *f);
00217 /* Finish writing out results for a query to file. */
00218 
00219 void gfOutputHead(struct gfOutput *out, FILE *f);
00220 /* Write out header if any. */
00221 
00222 void gfOutputFree(struct gfOutput **pOut);
00223 /* Free up output. */
00224 
00225 /* -------- Routines to build up index ------------ */
00226 
00227 void gfCheckTileSize(int tileSize, boolean isPep);
00228 /* Check that tile size is legal.  Abort if not. */
00229 
00230 struct genoFind *gfIndexSeq(bioSeq *seqList,
00231         int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
00232         boolean isPep, boolean allowOneMismatch, boolean maskUpper,
00233         int stepSize);
00234 /* Make index for all seqs in list. 
00235  *      minMatch - minimum number of matching tiles to trigger alignments
00236  *      maxGap   - maximum deviation from diagonal of tiles
00237  *      tileSize - size of tile in nucleotides
00238  *      maxPat   - maximum use of tile to not be considered a repeat
00239  *      oocFile  - .ooc format file that lists repeat tiles.  May be NULL. 
00240  *      isPep    - TRUE if indexing proteins, FALSE for DNA. 
00241  *      maskUpper - Mask out upper case sequence (currently only for nucleotides).
00242  *      stepSize - space between tiles.  Zero means default (which is tileSize). 
00243  * For DNA sequences upper case bits will be unindexed. */
00244 
00245 struct genoFind *gfIndexNibsAndTwoBits(int fileCount, char *fileNames[],
00246         int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, 
00247         boolean allowOneMismatch, int stepSize);
00248 /* Make index for all .nib and .2bits in list. 
00249  *      minMatch - minimum number of matching tiles to trigger alignments
00250  *      maxGap   - maximum deviation from diagonal of tiles
00251  *      tileSize - size of tile in nucleotides
00252  *      maxPat   - maximum use of tile to not be considered a repeat
00253  *      oocFile  - .ooc format file that lists repeat tiles.  May be NULL. 
00254  *      allowOneMismatch - allow one mismatch in a tile.  
00255  *      stepSize - space between tiles.  Zero means default (which is tileSize). */
00256 
00257 void gfIndexTransNibsAndTwoBits(struct genoFind *transGf[2][3], 
00258     int fileCount, char *fileNames[], 
00259     int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
00260     boolean allowOneMismatch, boolean mask, int stepSize);
00261 /* Make translated (6 frame) index for all .nib and .2bit files. */
00262 
00263 /* -------- Routines to scan index for homolgous areas ------------ */
00264 
00265 struct gfClump *gfFindClumps(struct genoFind *gf, struct dnaSeq *seq, 
00266         struct lm *lm, int *retHitCount);
00267 /* Find clumps associated with one sequence. */
00268 
00269 struct gfClump *gfFindClumpsWithQmask(struct genoFind *gf, bioSeq *seq, 
00270         Bits *qMaskBits, int qMaskOffset,
00271         struct lm *lm, int *retHitCount);
00272 /* Find clumps associated with one sequence soft-masking seq according to qMaskBits */
00273 
00274 struct gfHit *gfFindHitsInRegion(struct genoFind *gf, bioSeq *seq, 
00275         Bits *qMaskBits, int qMaskOffset, struct lm *lm, 
00276         struct gfSeqSource *target, int tMin, int tMax);
00277 /* Find hits restricted to one particular region. 
00278  * The hits returned by this will be in target sequence
00279  * coordinates rather than concatenated whole genome
00280  * coordinates as hits inside of clumps usually are.  */
00281 
00282 void gfTransFindClumps(struct genoFind *gfs[3], aaSeq *seq, struct gfClump *clumps[3], struct lm *lm, int *retHitCount);
00283 /* Find clumps associated with one sequence in three translated reading frames. */
00284 
00285 void gfTransTransFindClumps(struct genoFind *gfs[3], aaSeq *seqs[3], 
00286         struct gfClump *clumps[3][3], struct lm *lm, int *retHitCount);
00287 /* Find clumps associated with three sequences in three translated 
00288  * reading frames. Used for translated/translated protein comparisons. */
00289 
00290 void gfClumpDump(struct genoFind *gf, struct gfClump *clump, FILE *f);
00291 /* Print out info on clump.  This routine subtracts clump->target->start 
00292  * from clump->tStart and from clump->tEnd for printing, so that printed 
00293  * coords are relative to that target sequence. */
00294 
00295 
00296 void gfAlignAaClumps(struct genoFind *gf,  struct gfClump *clumpList, aaSeq *seq,
00297     boolean isRc,  int minMatch,  struct gfOutput *out);
00298 /* Convert gfClumps to an actual alignment that gets saved via 
00299  * outFunction/outData. */
00300 
00301 void gfFindAlignAaTrans(struct genoFind *gfs[3], aaSeq *qSeq, struct hash *t3Hash, 
00302         boolean tIsRc, int minMatch, struct gfOutput *out);
00303 /* Look for qSeq alignment in three translated reading frames. Save alignment
00304  * via outFunction/outData. */
00305 
00306 
00307 /* ---  Some routines for dealing with gfServer at a low level ---- */
00308 
00309 char *gfSignature();
00310 /* Return signature that starts each command to gfServer. Helps defend 
00311  * server from confused clients. */
00312 
00313 void gfCatchPipes();
00314 /* Set up to catch broken pipe signals. */
00315 
00316 int gfReadMulti(int sd, void *vBuf, size_t size);
00317 /* Read in until all is read or there is an error. */
00318 
00319 /* ---  Some routines for dealing with gfServer at a high level ---- */
00320 
00321 struct hash *gfFileCacheNew();
00322 /* Create hash for storing info on .nib and .2bit files. */
00323 
00324 void gfFileCacheFree(struct hash **pCache);
00325 /* Free up resources in cache. */
00326 
00327 void gfAlignStrand(int *pConn, char *nibDir, struct dnaSeq *seq,
00328     boolean isRc,  int minMatch, 
00329     struct hash *tFileCache, struct gfOutput *out);
00330 /* Search genome on server with one strand of other sequence to find homology. 
00331  * Then load homologous bits of genome locally and do detailed alignment.
00332  * Call 'outFunction' with each alignment that is found.  gfSavePsl is a handy
00333  * outFunction to use. */
00334 
00335 void gfAlignTrans(int *pConn, char *nibDir, aaSeq *seq,
00336     int minMatch, struct hash *tFileHash, struct gfOutput *out);
00337 /* Search indexed translated genome on server with an amino acid sequence. 
00338  * Then load homologous bits of genome locally and do detailed alignment.
00339  * Call 'outFunction' with each alignment that is found. */
00340 
00341 void gfAlignTransTrans(int *pConn, char *nibDir, struct dnaSeq *seq, 
00342         boolean qIsRc, int minMatch, struct hash *tFileCache, 
00343         struct gfOutput *out, boolean isRna);
00344 /* Search indexed translated genome on server with an dna sequence.  Translate
00345  * this sequence in three frames. Load homologous bits of genome locally
00346  * and do detailed alignment.  Call 'outFunction' with each alignment
00347  * that is found. */
00348 
00349 int gfConnect(char *hostName, char *portName);
00350 /* Set up our network connection to server. */
00351 
00352 void gfMakeOoc(char *outName, char *files[], int fileCount, 
00353         int tileSize, bits32 maxPat, enum gfType tType);
00354 /* Count occurences of tiles in seqList and make a .ooc file. */
00355 
00356 void gfLongDnaInMem(struct dnaSeq *query, struct genoFind *gf, 
00357    boolean isRc, int minScore, Bits *qMaskBits, struct gfOutput *out,
00358    boolean fastMap, boolean band);
00359 /* Chop up query into pieces, align each, and stitch back
00360  * together again. */
00361 
00362 void gfLongTransTransInMem(struct dnaSeq *query, struct genoFind *gfs[3], 
00363    struct hash *t3Hash, boolean qIsRc, boolean tIsRc, boolean qIsRna,
00364    int minScore, struct gfOutput *out);
00365 /* Chop up query into pieces, align each in translated space, and stitch back
00366  * together again as nucleotides. */
00367 
00368 struct gfClump *gfPcrClumps(struct genoFind *gf, 
00369         char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize,
00370         int minDistance, int maxDistance);
00371 /* Find possible PCR hits.  The fPrimer and rPrimer are on opposite strands.
00372  * Note: unlike clumps from other query functions, PCR clumps from this 
00373  * function have already had clump->target->start subtracted from 
00374  * clump->tStart and clump->tEnd so that the coords are relative to that 
00375  * target sequence (not the collection of all target sequences). */
00376 
00377 #define gfVersion "34"  /* Current BLAT version number */
00378 
00379 #endif /* GENOFIND_H */
00380 

Generated on Tue Dec 25 18:39:29 2007 for blat by  doxygen 1.5.2