jkOwnLib/patSpace.c

Go to the documentation of this file.
00001 /* patSpace - a homology finding algorithm that occurs mostly in 
00002  * pattern space (as opposed to offset space). */
00003 /* Copyright 1999-2003 Jim Kent.  All rights reserved. */
00004 #include "common.h"
00005 #include "portable.h"
00006 #include "dnaseq.h"
00007 #include "ooc.h"
00008 #include "patSpace.h"
00009 
00010 static char const rcsid[] = "$Id: patSpace.c,v 1.3 2006/03/15 18:36:16 angie Exp $";
00011 
00012 #define blockSize (256)
00013 
00014 #define BIGONE  /* define this for 32 bit version. */
00015 
00016 #ifdef BIGONE
00017 
00018 #define maxBlockCount (2*230*1024 - 1)
00019 #define psBits bits32
00020 /* psBits is the size of an index word.  If 16 bits
00021  * patSpace will use less memory, but be limited to
00022  * 16 meg or less genome size. */
00023 
00024 #else /* BIGONE */
00025 
00026 #define maxBlockCount (64*1024 - 1)
00027 #define psBits bits16
00028 /* psBits is the size of an index word.  If 16 bits
00029  * patSpace will use less memory, but be limited to
00030  * 16 meg or less genome size. */
00031 
00032 #endif /* BIGONE */
00033 
00034 
00035 #define maxPatCount (1024*16)    /* Maximum allowed count for one pattern. */
00036 
00037 struct blockPos
00038 /* Structure that stores where in a BAC we are - 
00039  * essentially an unpacked block index. */
00040     {
00041     bits16 bacIx;          /* Which bac. */
00042     bits16 seqIx;          /* Which subsequence in bac. */
00043     struct dnaSeq *seq; /* Pointer to subsequence. */
00044     int offset;         /* Start position of block within subsequence. */
00045     int size;           /* Size of block within subsequence. */
00046     };
00047 
00048 struct patSpace
00049 /* A pattern space - something that holds an index of all 10-mers in
00050  * genome. */
00051     {
00052     psBits **lists;                      /* A list for each 10-mer */
00053     psBits *listSizes;                   /* Size of list for each 10-mer */
00054     psBits *allocated;                   /* Storage space for all lists. */
00055     int    blocksUsed;                   /* Number of blocks used, <= maxBlockCount */
00056     int posBuf[maxBlockCount];           /* Histogram of hits to each block. */
00057     int hitBlocks[maxBlockCount];        /* Indexes of blocks with hits. */
00058     struct blockPos blockPos[maxBlockCount]; /* Relates block to sequence. */
00059     psBits maxPat;                       /* Max # of times pattern can occur
00060                                           * before it is ignored. */
00061     int minMatch;                        /* Minimum number of tile hits needed
00062                                           * to trigger a clump hit. */
00063     int maxGap;                          /* Max gap between tiles in a clump. */
00064     int seedSize;                        /* Size of alignment seed. */
00065     int seedSpaceSize;                    /* Number of possible seed values. */
00066     };
00067 
00068 void freePatSpace(struct patSpace **pPatSpace)
00069 /* Free up a pattern space. */
00070 {
00071 struct patSpace *ps = *pPatSpace;
00072 if (ps != NULL)
00073     {
00074     freeMem(ps->allocated);
00075     freez(pPatSpace);
00076     }
00077 }
00078 
00079 void tooManyBlocks()
00080 /* Complain about too many blocks and abort. */
00081 {
00082 errAbort("Too many blocks, can only handle %d\n", maxBlockCount);
00083 }
00084 
00085 static void fixBlockSize(struct blockPos *bp, int blockIx)
00086 /* Update size field from rest of blockPos. */
00087 {
00088 struct dnaSeq *seq = bp->seq;
00089 int size = seq->size - bp->offset;
00090 if (size > blockSize)
00091         size = blockSize;
00092 bp->size = size;
00093 }
00094 
00095 static struct patSpace *newPatSpace(int minMatch, int maxGap, int seedSize)
00096 /* Return an empty pattern space. */
00097 {
00098 struct patSpace *ps;
00099 int seedBitSize = seedSize*2;
00100 int seedSpaceSize;
00101 
00102 AllocVar(ps);
00103 ps->seedSize = seedSize;
00104 seedSpaceSize = ps->seedSpaceSize = (1<<seedBitSize);
00105 ps->lists = needLargeZeroedMem(seedSpaceSize * sizeof(ps->lists[0]));
00106 ps->listSizes = needLargeZeroedMem(seedSpaceSize * sizeof(ps->listSizes[0]));
00107 ps->minMatch = minMatch;
00108 ps->maxGap = maxGap;
00109 return ps;
00110 }
00111 
00112 static void countPatSpace(struct patSpace *ps, struct dnaSeq *seq)
00113 /* Add up number of times each pattern occurs in sequence into ps->listSizes. */
00114 {
00115 int size = seq->size;
00116 int mask = ps->seedSpaceSize-1;
00117 DNA *dna = seq->dna;
00118 int i;
00119 int bits = 0;
00120 int bVal;
00121 int ls;
00122 
00123 for (i=0; i<ps->seedSize-1; ++i)
00124     {
00125     bVal = ntValNoN[(int)dna[i]];
00126     bits <<= 2;
00127     bits += bVal;
00128     }
00129 for (i=ps->seedSize-1; i<size; ++i)
00130     {
00131     bVal = ntValNoN[(int)dna[i]];
00132     bits <<= 2;
00133     bits += bVal;
00134     bits &= mask;
00135     ls = ps->listSizes[bits];
00136     if (ls < maxPatCount)
00137         ps->listSizes[bits] = ls+1;
00138     }
00139 }
00140 
00141 static int allocPatSpaceLists(struct patSpace *ps)
00142 /* Allocate pat space lists and set up list pointers. 
00143  * Returns size of all lists. */
00144 {
00145 int oneCount;
00146 int count = 0;
00147 int i;
00148 psBits *listSizes = ps->listSizes;
00149 psBits **lists = ps->lists;
00150 psBits *allocated;
00151 psBits maxPat = ps->maxPat;
00152 int size;
00153 int usedCount = 0, overusedCount = 0;
00154 int seedSpaceSize = ps->seedSpaceSize;
00155 
00156 for (i=0; i<seedSpaceSize; ++i)
00157     {
00158     /* If pattern is too much used it's no good to us, ignore. */
00159     if ((oneCount = listSizes[i]) < maxPat)
00160         {
00161         count += oneCount;
00162         usedCount += 1;
00163         }
00164     else
00165         {
00166         overusedCount += 1;
00167         }
00168     }
00169 ps->allocated = allocated = needLargeMem(count*sizeof(allocated[0]));
00170 for (i=0; i<seedSpaceSize; ++i)
00171     {
00172     if ((size = listSizes[i]) < maxPat)
00173         {
00174         lists[i] = allocated;
00175         allocated += size;
00176         }
00177     }
00178 return count;
00179 }
00180 
00181 static int addToPatSpace(struct patSpace *ps, int bacIx, int seqIx, struct dnaSeq *seq,int startBlock)
00182 /* Add contents of one sequence to pattern space. Returns end block. */
00183 {
00184 int size = seq->size;
00185 int mask = ps->seedSpaceSize-1;
00186 DNA *dna = seq->dna;
00187 int i;
00188 int bits = 0;
00189 int bVal;
00190 int blockMod = blockSize;
00191 int curCount;
00192 psBits maxPat = ps->maxPat;
00193 psBits *listSizes = ps->listSizes;
00194 psBits **lists = ps->lists;
00195 struct blockPos *bp = &ps->blockPos[startBlock];
00196 
00197 bp->bacIx = bacIx;
00198 bp->seqIx = seqIx;
00199 bp->seq = seq;
00200 bp->offset = 0;
00201 fixBlockSize(bp, startBlock);
00202 ++bp;
00203 for (i=0; i<ps->seedSize-1; ++i)
00204     {
00205     bVal = ntValNoN[(int)dna[i]];
00206     bits <<= 2;
00207     bits += bVal;
00208     }
00209 for (i=ps->seedSize-1; i<size; ++i)
00210     {
00211     bVal = ntValNoN[(int)dna[i]];
00212     bits <<= 2;
00213     bits += bVal;
00214     bits &= mask;
00215     if ((curCount = listSizes[bits]) < maxPat)
00216         {
00217         listSizes[bits] = curCount+1;
00218         lists[bits][curCount] = startBlock;
00219         }
00220     if (--blockMod == 0)
00221         {
00222         if (++startBlock >= maxBlockCount)
00223             tooManyBlocks();
00224         blockMod = blockSize;
00225         bp->bacIx = bacIx;
00226         bp->seqIx = seqIx;
00227         bp->seq = seq;
00228         bp->offset = i - (ps->seedSize-1) + 1;
00229         fixBlockSize(bp, startBlock);
00230         ++bp;
00231         }
00232     }
00233 for (i=0; i<ps->seedSize-1; ++i)
00234     {
00235     if (--blockMod == 0)
00236         {
00237         if (++startBlock >= maxBlockCount)
00238             tooManyBlocks();
00239         blockMod = blockSize;
00240         }
00241     }
00242 if (blockMod != blockSize)
00243     ++startBlock;
00244 return startBlock;
00245 }
00246 
00247 struct patSpace *makePatSpace(
00248     struct dnaSeq **seqArray,       /* Array of sequence lists. */
00249     int seqArrayCount,              /* Size of above array. */
00250     int seedSize,                   /* Alignment seed size - 10 or 11. Must match oocFileName */
00251     char *oocFileName,              /* File with tiles to filter out, may be NULL. */
00252     int minMatch,                   /* Minimum # of matching tiles.  4 is good. */
00253     int maxGap                      /* Maximum gap size - 32k is good for 
00254                                        cDNA (introns), 500 is good for DNA assembly. */
00255     )
00256 /* Allocate a pattern space and fill from sequences.  (Each element of
00257    seqArray is a list of sequences. */
00258 {
00259 struct patSpace *ps = newPatSpace(minMatch, maxGap,seedSize);
00260 int i;
00261 int startIx = 0;
00262 int total = 0;
00263 struct dnaSeq *seq;
00264 psBits maxPat;
00265 psBits *listSizes;
00266 int seedSpaceSize = ps->seedSpaceSize;
00267 
00268 maxPat = ps->maxPat = maxPatCount;
00269 for (i=0; i<seqArrayCount; ++i)
00270     {
00271     for (seq = seqArray[i]; seq != NULL; seq = seq->next)
00272         {
00273         total += seq->size;
00274         countPatSpace(ps, seq);
00275         }
00276     }
00277 
00278 listSizes = ps->listSizes;
00279 
00280 /* Scan through over-popular patterns and set their count to value 
00281  * where they won't be added to pat space. */
00282 oocMaskCounts(oocFileName, listSizes, ps->seedSize, maxPat);
00283 
00284 /* Get rid of simple repeats as well. */
00285 oocMaskSimpleRepeats(listSizes, ps->seedSize, maxPat);
00286 
00287 
00288 allocPatSpaceLists(ps);
00289 
00290 /* Zero out pattern counts that aren't oversubscribed. */
00291 for (i=0; i<ps->seedSpaceSize; ++i)
00292     {
00293     if (listSizes[i] < maxPat)
00294         listSizes[i] = 0;
00295     }
00296 for (i=0; i<seqArrayCount; ++i)
00297     {
00298         int j;
00299     for (seq = seqArray[i], j=0; seq != NULL; seq = seq->next, ++j)
00300         {
00301         startIx = addToPatSpace(ps, i, j, seq, startIx);
00302         if (startIx >= maxBlockCount)
00303             tooManyBlocks();
00304         }
00305     }
00306 ps->blocksUsed = startIx;
00307 
00308 /* Zero local over-popular patterns. */
00309 for (i=0; i<seedSpaceSize; ++i)
00310     {
00311     if (listSizes[i] >= maxPat)
00312         listSizes[i] = 0;
00313     }
00314 
00315 return ps;
00316 }
00317 
00318 
00319 void bfFind(struct patSpace *ps, int block, struct blockPos *ret)
00320 /* Find dna sequence and position within sequence that corresponds to block. */
00321 {
00322 assert(block >= 0 && block < maxBlockCount);
00323 *ret = ps->blockPos[block];
00324 }
00325 
00326 
00327 static struct patClump *newClump(struct patSpace *ps, struct blockPos *first, struct blockPos *last)
00328 /* Make new clump . */
00329 {
00330 struct dnaSeq *seq = first->seq;
00331 int seqIx = first->seqIx;
00332 int bacIx = first->bacIx;
00333 int start = first->offset;
00334 int end = last->offset+last->size;
00335 int extraAtEnds = blockSize/2;
00336 struct patClump *cl;
00337 
00338 start -= extraAtEnds;
00339 if (start < 0)
00340     start = 0;
00341 end += extraAtEnds;
00342 if (end >seq->size)
00343     end = seq->size;
00344 AllocVar(cl);
00345 cl->bacIx = bacIx;
00346 cl->seqIx = seqIx;
00347 cl->seq = seq;
00348 cl->start = start;
00349 cl->size = end-start;
00350 return cl;
00351 }
00352 
00353 
00354 static struct patClump *clumpHits(struct patSpace *ps,
00355     int hitBlocks[], int hitCount, int posBuf[], 
00356     DNA *dna, int dnaSize)
00357 /* Clumps together hits and returns a list. */
00358 {
00359 /* Clump together hits. */
00360 int block;
00361 int i;
00362 int maxGap = ps->maxGap;
00363 struct blockPos first, cur, pre;
00364 struct patClump *patClump = NULL, *cl;
00365 
00366 bfFind(ps, hitBlocks[0], &first);
00367 pre = first;
00368 for (i=1; i<hitCount; ++i)
00369     {
00370     block = hitBlocks[i];
00371     bfFind(ps, block, &cur);
00372     if (cur.seq != pre.seq || cur.offset - (pre.offset + pre.size) > maxGap)
00373         {
00374         /* Write old clump and start new one. */
00375         cl = newClump(ps, &first, &pre);
00376         slAddHead(&patClump, cl);
00377         first = cur;
00378         }
00379     else
00380         {
00381         /* Extend old clump. */
00382         }
00383     pre = cur;
00384     }
00385 /* Write hitOut last clump. */
00386 cl = newClump(ps, &first, &pre);
00387 slAddHead(&patClump, cl);
00388 slReverse(&patClump);
00389 return patClump;
00390 }
00391 
00392 struct patClump *patSpaceFindOne(struct patSpace *ps, DNA *dna, int dnaSize)
00393 /* Find occurrences of DNA in patSpace. */
00394 {
00395 int lastStart = dnaSize - ps->seedSize;
00396 int i,j;
00397 int pat;
00398 int hitBlockCount = 0;
00399 int totalSigHits = 0;
00400 DNA *tile = dna;
00401 int blocksUsed = ps->blocksUsed;
00402 int *posBuf = ps->posBuf;
00403 int *hitBlocks = ps->hitBlocks;
00404 int minMatch = ps->minMatch;
00405 
00406 memset(ps->posBuf, 0, sizeof(ps->posBuf[0]) * blocksUsed);
00407 for (i=0; i<=lastStart; i += ps->seedSize)
00408     {
00409     psBits *list;
00410     psBits count;
00411 
00412     pat = 0;
00413     for (j=0; j<ps->seedSize; ++j)
00414         {
00415         int bVal = ntValNoN[(int)tile[j]];
00416         pat <<= 2;
00417         pat += bVal;
00418         }
00419     list = ps->lists[pat];
00420     if ((count = ps->listSizes[pat]) > 0)
00421         {
00422         for (j=0; j<count; ++j)
00423             posBuf[list[j]] += 1;            
00424         }
00425     tile += ps->seedSize;
00426     }
00427 
00428 /* Scan through array that records counts of hits at positions. */
00429 for (i=0; i<blocksUsed-1; ++i)
00430     {
00431     /* Save significant hits in a more compact array */
00432     int a = posBuf[i], b = posBuf[i+1];
00433     int sum = a + b;
00434     if (sum >= minMatch)
00435         {
00436         if (a > 0)
00437             {
00438             if (hitBlockCount == 0 || hitBlocks[hitBlockCount-1] != i)
00439                 {
00440                 hitBlocks[hitBlockCount++] = i;
00441                 totalSigHits += a;
00442                 }
00443             }
00444         if (b > 0)
00445             {
00446             hitBlocks[hitBlockCount++] = i+1;
00447             totalSigHits += b;
00448             }
00449         }
00450     }
00451 
00452 /* Output data with significant hits. */
00453 if (hitBlockCount > 0 && totalSigHits*ps->seedSize*8 > dnaSize)
00454     {
00455     return clumpHits(ps, hitBlocks, hitBlockCount, posBuf, dna, 
00456         dnaSize);
00457     }        
00458 else
00459     return NULL;
00460 }
00461 
00462 
00463 
00464 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2