00001 /* genoFind.h - Interface to modules for fast finding of sequence 00002 * matches. */ 00003 /* Copyright 2001-2002 Jim Kent. All rights reserved. */ 00004 00005 #ifndef GENOFIND_H 00006 #define GENOFIND_H 00007 00008 #ifndef DNASEQ_H 00009 #include "dnaseq.h" 00010 #endif 00011 00012 #ifndef FUZZYFIND_H 00013 #include "fuzzyFind.h" 00014 #endif 00015 00016 #ifndef HASH_H 00017 #include "hash.h" 00018 #endif 00019 00020 #ifndef ALITYPE_H 00021 #include "aliType.h" 00022 #endif 00023 00024 #ifndef LOCALMEM_H 00025 #include "localmem.h" 00026 #endif 00027 00028 #ifndef BITS_H 00029 #include "bits.h" 00030 #endif 00031 00032 #ifndef AXT_H 00033 #include "axt.h" 00034 #endif 00035 00036 enum gfConstants { 00037 gfMinMatch = 2, 00038 gfMaxGap = 2, 00039 gfTileSize = 11, 00040 gfMaxTileUse = 1024, 00041 gfPepMaxTileUse = 30000, 00042 }; 00043 00044 struct gfSeqSource 00045 /* Where a block of sequence comes from. */ 00046 { 00047 struct gfSeqSource *next; 00048 char *fileName; /* Name of file. */ 00049 bioSeq *seq; /* Sequences. Usually either this or fileName is NULL. */ 00050 bits32 start,end; /* Position within merged sequence. */ 00051 Bits *maskedBits; /* If non-null contains repeat-masking info. */ 00052 }; 00053 00054 struct gfHit 00055 /* A genoFind hit. */ 00056 { 00057 struct gfHit *next; 00058 bits32 qStart; /* Where it hits in query. */ 00059 bits32 tStart; /* Where it hits in target. */ 00060 bits32 diagonal; /* tStart + qSize - qStart. */ 00061 }; 00062 00063 /* gfHits are free'd with simple freeMem or slFreeList. */ 00064 00065 struct gfClump 00066 /* A clump of hits. */ 00067 /* Note: for clumps from regular (blat) queries, tStart and tEnd include 00068 * target->start, but for clumps from gfPcrClumps(), tStart and tEnd have 00069 * already had target->start subtracted. So tStart and tEnd in PCR clumps 00070 * are relative to that target sequence (not the collection of all target 00071 * sequences). */ 00072 { 00073 struct gfClump *next; /* Next clump. */ 00074 bits32 qStart, qEnd; /* Position in query. */ 00075 struct gfSeqSource *target; /* Target source sequence. */ 00076 bits32 tStart, tEnd; /* Position in target. */ 00077 int hitCount; /* Number of hits. */ 00078 struct gfHit *hitList; /* List of hits. Not allocated here. */ 00079 int queryCoverage; /* Number of bases covered in query (thx AG!) */ 00080 }; 00081 00082 void gfClumpFree(struct gfClump **pClump); 00083 /* Free a single clump. */ 00084 00085 void gfClumpFreeList(struct gfClump **pList); 00086 /* Free a list of dynamically allocated gfClump's */ 00087 00088 struct genoFind 00089 /* An index of all K-mers in the genome. */ 00090 { 00091 int maxPat; /* Max # of times pattern can occur 00092 * before it is ignored. */ 00093 int minMatch; /* Minimum number of tile hits needed 00094 * to trigger a clump hit. */ 00095 int maxGap; /* Max gap between tiles in a clump. */ 00096 int tileSize; /* Size of each N-mer. */ 00097 int stepSize; /* Spacing between N-mers. */ 00098 int tileSpaceSize; /* Number of N-mer values. */ 00099 int tileMask; /* 1-s for each N-mer. */ 00100 int sourceCount; /* Count of source files. */ 00101 struct gfSeqSource *sources; /* List of sequence sources. */ 00102 bool isPep; /* Is a peptide. */ 00103 bool allowOneMismatch; /* Allow a single mismatch? */ 00104 int segSize; /* Index is segmented if non-zero. */ 00105 bits32 totalSeqSize; /* Total size of all sequences. */ 00106 bits32 *listSizes; /* Size of list for each N-mer */ 00107 void *allocated; /* Storage space for all lists. */ 00108 bits32 **lists; /* A list for each N-mer. Used if 00109 * isSegmented is false. */ 00110 bits16 **endLists; /* A more complex list for each N-mer. 00111 * Used if isSegmented is true. 00112 * Values come in groups of threes. 00113 * The first is the packed last few 00114 * letters of the tile. The next two 00115 * are the offset in the genome. This 00116 * would be a struct but that would take 00117 * 8 bytes instead of 6, or nearly an 00118 * extra gigabyte of RAM. */ 00119 }; 00120 00121 void genoFindFree(struct genoFind **pGenoFind); 00122 /* Free up a genoFind index. */ 00123 00124 struct gfSeqSource *gfFindNamedSource(struct genoFind *gf, char *name); 00125 /* Find target of given name. Return NULL if none. */ 00126 00127 /* --- Stuff for saving results ---- */ 00128 00129 00130 struct gfOutput 00131 /* A polymorphic object to help us write many file types. */ 00132 { 00133 struct gfOutput *next; 00134 void *data; /* Type-specific data pointer. Must be freeMem'able */ 00135 void (*out)(char *chromName, int chromSize, int chromOffset, 00136 struct ffAli *ali, bioSeq *tSeq, struct hash *t3Hash, bioSeq *qSeq, 00137 boolean qIsRc, boolean tIsRc, 00138 enum ffStringency stringency, int minMatch, struct gfOutput *out); 00139 /* This is the type of a client provided function to save an alignment. 00140 * The parameters are: 00141 * chromName - name of target (aka genomic or database) sequence. 00142 * chromSize - size of target sequence. 00143 * chromOffset - offset of genoSequence in target. 00144 * ffAli - alignment with pointers into tSeq/qSeq or in 00145 * translated target case, into t3Hash. 00146 * tSeq - part of target sequence in normal case. In translated 00147 * target case look at t3Hash instead. 00148 * t3Hash - used only in translated target case. A hash keyed by 00149 * target sequence name with values *lists* of trans3 structures. 00150 * This hash can be searched to find both the translated and 00151 * untranslated versions of the bits of the target that are in 00152 * memory. (You can assume at this point all parts needed for 00153 * output are indeed in memory.) 00154 * qSeq - query sequence (this isn't segmented at all). 00155 * isRc - True if query is reverse complemented. 00156 * stringency - ffCdna, etc. I'm hoping to move this elsewhere. 00157 * minMatch - minimum score to output. Also should be moved elsewhere. 00158 * outputData - custom data for specific output function. 00159 * The interface is a bit complex - partly from the demands of translated 00160 * output, and partly from trying not to have the entire target sequence in 00161 * memory. 00162 */ 00163 void (*queryOut)(struct gfOutput *out, FILE *f); 00164 /* Called for each query */ 00165 00166 void (*fileHead)(struct gfOutput *out, FILE *f); 00167 /* Write file header if any */ 00168 00169 boolean reportTargetStrand; /* Report target as well as query strand? */ 00170 struct hash *maskHash; /* associates target sequence name and mask. */ 00171 int minGood; /* Minimum sequence identity in thousandths. */ 00172 boolean qIsProt; /* Query is peptide. */ 00173 boolean tIsProt; /* Target is peptide. */ 00174 int queryIx; /* Index of query */ 00175 boolean includeTargetFile; /* Prefix file: to target sequence name. */ 00176 }; 00177 00178 struct gfOutput *gfOutputAny(char *format, 00179 int goodPpt, boolean qIsProt, boolean tIsProt, 00180 boolean noHead, char *databaseName, 00181 int databaseSeqCount, double databaseLetters, 00182 double minIdentity, FILE *f); 00183 /* Initialize output in a variety of formats in file or memory. 00184 * Parameters: 00185 * format - either 'psl', 'pslx', 'blast', 'wublast', 'axt' 00186 * goodPpt - minimum identity of alignments to output in parts per thousand 00187 * qIsProt - true if query side is a protein. 00188 * tIsProt - true if target (database) side is a protein. 00189 * noHead - if true suppress header in psl/pslx output. 00190 * databaseName - name of database. Only used for blast output 00191 * databaseSeq - number of sequences in database - only for blast 00192 * databaseLetters - number of bases/aas in database - only blast 00193 * FILE *f - file. 00194 */ 00195 00196 struct gfOutput *gfOutputPsl(int goodPpt, 00197 boolean qIsProt, boolean tIsProt, FILE *f, 00198 boolean saveSeq, boolean noHead); 00199 /* Set up psl/pslx output */ 00200 00201 struct gfOutput *gfOutputAxt(int goodPpt, boolean qIsProt, 00202 boolean tIsProt, FILE *f); 00203 /* Setup output for axt format. */ 00204 00205 struct gfOutput *gfOutputAxtMem(int goodPpt, boolean qIsProt, 00206 boolean tIsProt); 00207 /* Setup output for in memory axt output. */ 00208 00209 struct gfOutput *gfOutputBlast(int goodPpt, 00210 boolean qIsProt, boolean tIsProt, 00211 char *databaseName, int databaseSeqCount, double databaseLetters, 00212 char *blastType, /* blast, blast8, blast9, wublast, or xml */ 00213 double minIdentity, FILE *f); 00214 /* Setup output for blast/wublast format. */ 00215 00216 void gfOutputQuery(struct gfOutput *out, FILE *f); 00217 /* Finish writing out results for a query to file. */ 00218 00219 void gfOutputHead(struct gfOutput *out, FILE *f); 00220 /* Write out header if any. */ 00221 00222 void gfOutputFree(struct gfOutput **pOut); 00223 /* Free up output. */ 00224 00225 /* -------- Routines to build up index ------------ */ 00226 00227 void gfCheckTileSize(int tileSize, boolean isPep); 00228 /* Check that tile size is legal. Abort if not. */ 00229 00230 struct genoFind *gfIndexSeq(bioSeq *seqList, 00231 int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, 00232 boolean isPep, boolean allowOneMismatch, boolean maskUpper, 00233 int stepSize); 00234 /* Make index for all seqs in list. 00235 * minMatch - minimum number of matching tiles to trigger alignments 00236 * maxGap - maximum deviation from diagonal of tiles 00237 * tileSize - size of tile in nucleotides 00238 * maxPat - maximum use of tile to not be considered a repeat 00239 * oocFile - .ooc format file that lists repeat tiles. May be NULL. 00240 * isPep - TRUE if indexing proteins, FALSE for DNA. 00241 * maskUpper - Mask out upper case sequence (currently only for nucleotides). 00242 * stepSize - space between tiles. Zero means default (which is tileSize). 00243 * For DNA sequences upper case bits will be unindexed. */ 00244 00245 struct genoFind *gfIndexNibsAndTwoBits(int fileCount, char *fileNames[], 00246 int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, 00247 boolean allowOneMismatch, int stepSize); 00248 /* Make index for all .nib and .2bits in list. 00249 * minMatch - minimum number of matching tiles to trigger alignments 00250 * maxGap - maximum deviation from diagonal of tiles 00251 * tileSize - size of tile in nucleotides 00252 * maxPat - maximum use of tile to not be considered a repeat 00253 * oocFile - .ooc format file that lists repeat tiles. May be NULL. 00254 * allowOneMismatch - allow one mismatch in a tile. 00255 * stepSize - space between tiles. Zero means default (which is tileSize). */ 00256 00257 void gfIndexTransNibsAndTwoBits(struct genoFind *transGf[2][3], 00258 int fileCount, char *fileNames[], 00259 int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, 00260 boolean allowOneMismatch, boolean mask, int stepSize); 00261 /* Make translated (6 frame) index for all .nib and .2bit files. */ 00262 00263 /* -------- Routines to scan index for homolgous areas ------------ */ 00264 00265 struct gfClump *gfFindClumps(struct genoFind *gf, struct dnaSeq *seq, 00266 struct lm *lm, int *retHitCount); 00267 /* Find clumps associated with one sequence. */ 00268 00269 struct gfClump *gfFindClumpsWithQmask(struct genoFind *gf, bioSeq *seq, 00270 Bits *qMaskBits, int qMaskOffset, 00271 struct lm *lm, int *retHitCount); 00272 /* Find clumps associated with one sequence soft-masking seq according to qMaskBits */ 00273 00274 struct gfHit *gfFindHitsInRegion(struct genoFind *gf, bioSeq *seq, 00275 Bits *qMaskBits, int qMaskOffset, struct lm *lm, 00276 struct gfSeqSource *target, int tMin, int tMax); 00277 /* Find hits restricted to one particular region. 00278 * The hits returned by this will be in target sequence 00279 * coordinates rather than concatenated whole genome 00280 * coordinates as hits inside of clumps usually are. */ 00281 00282 void gfTransFindClumps(struct genoFind *gfs[3], aaSeq *seq, struct gfClump *clumps[3], struct lm *lm, int *retHitCount); 00283 /* Find clumps associated with one sequence in three translated reading frames. */ 00284 00285 void gfTransTransFindClumps(struct genoFind *gfs[3], aaSeq *seqs[3], 00286 struct gfClump *clumps[3][3], struct lm *lm, int *retHitCount); 00287 /* Find clumps associated with three sequences in three translated 00288 * reading frames. Used for translated/translated protein comparisons. */ 00289 00290 void gfClumpDump(struct genoFind *gf, struct gfClump *clump, FILE *f); 00291 /* Print out info on clump. This routine subtracts clump->target->start 00292 * from clump->tStart and from clump->tEnd for printing, so that printed 00293 * coords are relative to that target sequence. */ 00294 00295 00296 void gfAlignAaClumps(struct genoFind *gf, struct gfClump *clumpList, aaSeq *seq, 00297 boolean isRc, int minMatch, struct gfOutput *out); 00298 /* Convert gfClumps to an actual alignment that gets saved via 00299 * outFunction/outData. */ 00300 00301 void gfFindAlignAaTrans(struct genoFind *gfs[3], aaSeq *qSeq, struct hash *t3Hash, 00302 boolean tIsRc, int minMatch, struct gfOutput *out); 00303 /* Look for qSeq alignment in three translated reading frames. Save alignment 00304 * via outFunction/outData. */ 00305 00306 00307 /* --- Some routines for dealing with gfServer at a low level ---- */ 00308 00309 char *gfSignature(); 00310 /* Return signature that starts each command to gfServer. Helps defend 00311 * server from confused clients. */ 00312 00313 void gfCatchPipes(); 00314 /* Set up to catch broken pipe signals. */ 00315 00316 int gfReadMulti(int sd, void *vBuf, size_t size); 00317 /* Read in until all is read or there is an error. */ 00318 00319 /* --- Some routines for dealing with gfServer at a high level ---- */ 00320 00321 struct hash *gfFileCacheNew(); 00322 /* Create hash for storing info on .nib and .2bit files. */ 00323 00324 void gfFileCacheFree(struct hash **pCache); 00325 /* Free up resources in cache. */ 00326 00327 void gfAlignStrand(int *pConn, char *nibDir, struct dnaSeq *seq, 00328 boolean isRc, int minMatch, 00329 struct hash *tFileCache, struct gfOutput *out); 00330 /* Search genome on server with one strand of other sequence to find homology. 00331 * Then load homologous bits of genome locally and do detailed alignment. 00332 * Call 'outFunction' with each alignment that is found. gfSavePsl is a handy 00333 * outFunction to use. */ 00334 00335 void gfAlignTrans(int *pConn, char *nibDir, aaSeq *seq, 00336 int minMatch, struct hash *tFileHash, struct gfOutput *out); 00337 /* Search indexed translated genome on server with an amino acid sequence. 00338 * Then load homologous bits of genome locally and do detailed alignment. 00339 * Call 'outFunction' with each alignment that is found. */ 00340 00341 void gfAlignTransTrans(int *pConn, char *nibDir, struct dnaSeq *seq, 00342 boolean qIsRc, int minMatch, struct hash *tFileCache, 00343 struct gfOutput *out, boolean isRna); 00344 /* Search indexed translated genome on server with an dna sequence. Translate 00345 * this sequence in three frames. Load homologous bits of genome locally 00346 * and do detailed alignment. Call 'outFunction' with each alignment 00347 * that is found. */ 00348 00349 int gfConnect(char *hostName, char *portName); 00350 /* Set up our network connection to server. */ 00351 00352 void gfMakeOoc(char *outName, char *files[], int fileCount, 00353 int tileSize, bits32 maxPat, enum gfType tType); 00354 /* Count occurences of tiles in seqList and make a .ooc file. */ 00355 00356 void gfLongDnaInMem(struct dnaSeq *query, struct genoFind *gf, 00357 boolean isRc, int minScore, Bits *qMaskBits, struct gfOutput *out, 00358 boolean fastMap, boolean band); 00359 /* Chop up query into pieces, align each, and stitch back 00360 * together again. */ 00361 00362 void gfLongTransTransInMem(struct dnaSeq *query, struct genoFind *gfs[3], 00363 struct hash *t3Hash, boolean qIsRc, boolean tIsRc, boolean qIsRna, 00364 int minScore, struct gfOutput *out); 00365 /* Chop up query into pieces, align each in translated space, and stitch back 00366 * together again as nucleotides. */ 00367 00368 struct gfClump *gfPcrClumps(struct genoFind *gf, 00369 char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize, 00370 int minDistance, int maxDistance); 00371 /* Find possible PCR hits. The fPrimer and rPrimer are on opposite strands. 00372 * Note: unlike clumps from other query functions, PCR clumps from this 00373 * function have already had clump->target->start subtracted from 00374 * clump->tStart and clump->tEnd so that the coords are relative to that 00375 * target sequence (not the collection of all target sequences). */ 00376 00377 #define gfVersion "34" /* Current BLAT version number */ 00378 00379 #endif /* GENOFIND_H */ 00380
1.5.2