00001
00002
00003
00004 #include "common.h"
00005 #include "portable.h"
00006 #include "dnaseq.h"
00007 #include "ooc.h"
00008 #include "patSpace.h"
00009
00010 static char const rcsid[] = "$Id: patSpace.c,v 1.3 2006/03/15 18:36:16 angie Exp $";
00011
00012 #define blockSize (256)
00013
00014 #define BIGONE
00015
00016 #ifdef BIGONE
00017
00018 #define maxBlockCount (2*230*1024 - 1)
00019 #define psBits bits32
00020
00021
00022
00023
00024 #else
00025
00026 #define maxBlockCount (64*1024 - 1)
00027 #define psBits bits16
00028
00029
00030
00031
00032 #endif
00033
00034
00035 #define maxPatCount (1024*16)
00036
00037 struct blockPos
00038
00039
00040 {
00041 bits16 bacIx;
00042 bits16 seqIx;
00043 struct dnaSeq *seq;
00044 int offset;
00045 int size;
00046 };
00047
00048 struct patSpace
00049
00050
00051 {
00052 psBits **lists;
00053 psBits *listSizes;
00054 psBits *allocated;
00055 int blocksUsed;
00056 int posBuf[maxBlockCount];
00057 int hitBlocks[maxBlockCount];
00058 struct blockPos blockPos[maxBlockCount];
00059 psBits maxPat;
00060
00061 int minMatch;
00062
00063 int maxGap;
00064 int seedSize;
00065 int seedSpaceSize;
00066 };
00067
00068 void freePatSpace(struct patSpace **pPatSpace)
00069
00070 {
00071 struct patSpace *ps = *pPatSpace;
00072 if (ps != NULL)
00073 {
00074 freeMem(ps->allocated);
00075 freez(pPatSpace);
00076 }
00077 }
00078
00079 void tooManyBlocks()
00080
00081 {
00082 errAbort("Too many blocks, can only handle %d\n", maxBlockCount);
00083 }
00084
00085 static void fixBlockSize(struct blockPos *bp, int blockIx)
00086
00087 {
00088 struct dnaSeq *seq = bp->seq;
00089 int size = seq->size - bp->offset;
00090 if (size > blockSize)
00091 size = blockSize;
00092 bp->size = size;
00093 }
00094
00095 static struct patSpace *newPatSpace(int minMatch, int maxGap, int seedSize)
00096
00097 {
00098 struct patSpace *ps;
00099 int seedBitSize = seedSize*2;
00100 int seedSpaceSize;
00101
00102 AllocVar(ps);
00103 ps->seedSize = seedSize;
00104 seedSpaceSize = ps->seedSpaceSize = (1<<seedBitSize);
00105 ps->lists = needLargeZeroedMem(seedSpaceSize * sizeof(ps->lists[0]));
00106 ps->listSizes = needLargeZeroedMem(seedSpaceSize * sizeof(ps->listSizes[0]));
00107 ps->minMatch = minMatch;
00108 ps->maxGap = maxGap;
00109 return ps;
00110 }
00111
00112 static void countPatSpace(struct patSpace *ps, struct dnaSeq *seq)
00113
00114 {
00115 int size = seq->size;
00116 int mask = ps->seedSpaceSize-1;
00117 DNA *dna = seq->dna;
00118 int i;
00119 int bits = 0;
00120 int bVal;
00121 int ls;
00122
00123 for (i=0; i<ps->seedSize-1; ++i)
00124 {
00125 bVal = ntValNoN[(int)dna[i]];
00126 bits <<= 2;
00127 bits += bVal;
00128 }
00129 for (i=ps->seedSize-1; i<size; ++i)
00130 {
00131 bVal = ntValNoN[(int)dna[i]];
00132 bits <<= 2;
00133 bits += bVal;
00134 bits &= mask;
00135 ls = ps->listSizes[bits];
00136 if (ls < maxPatCount)
00137 ps->listSizes[bits] = ls+1;
00138 }
00139 }
00140
00141 static int allocPatSpaceLists(struct patSpace *ps)
00142
00143
00144 {
00145 int oneCount;
00146 int count = 0;
00147 int i;
00148 psBits *listSizes = ps->listSizes;
00149 psBits **lists = ps->lists;
00150 psBits *allocated;
00151 psBits maxPat = ps->maxPat;
00152 int size;
00153 int usedCount = 0, overusedCount = 0;
00154 int seedSpaceSize = ps->seedSpaceSize;
00155
00156 for (i=0; i<seedSpaceSize; ++i)
00157 {
00158
00159 if ((oneCount = listSizes[i]) < maxPat)
00160 {
00161 count += oneCount;
00162 usedCount += 1;
00163 }
00164 else
00165 {
00166 overusedCount += 1;
00167 }
00168 }
00169 ps->allocated = allocated = needLargeMem(count*sizeof(allocated[0]));
00170 for (i=0; i<seedSpaceSize; ++i)
00171 {
00172 if ((size = listSizes[i]) < maxPat)
00173 {
00174 lists[i] = allocated;
00175 allocated += size;
00176 }
00177 }
00178 return count;
00179 }
00180
00181 static int addToPatSpace(struct patSpace *ps, int bacIx, int seqIx, struct dnaSeq *seq,int startBlock)
00182
00183 {
00184 int size = seq->size;
00185 int mask = ps->seedSpaceSize-1;
00186 DNA *dna = seq->dna;
00187 int i;
00188 int bits = 0;
00189 int bVal;
00190 int blockMod = blockSize;
00191 int curCount;
00192 psBits maxPat = ps->maxPat;
00193 psBits *listSizes = ps->listSizes;
00194 psBits **lists = ps->lists;
00195 struct blockPos *bp = &ps->blockPos[startBlock];
00196
00197 bp->bacIx = bacIx;
00198 bp->seqIx = seqIx;
00199 bp->seq = seq;
00200 bp->offset = 0;
00201 fixBlockSize(bp, startBlock);
00202 ++bp;
00203 for (i=0; i<ps->seedSize-1; ++i)
00204 {
00205 bVal = ntValNoN[(int)dna[i]];
00206 bits <<= 2;
00207 bits += bVal;
00208 }
00209 for (i=ps->seedSize-1; i<size; ++i)
00210 {
00211 bVal = ntValNoN[(int)dna[i]];
00212 bits <<= 2;
00213 bits += bVal;
00214 bits &= mask;
00215 if ((curCount = listSizes[bits]) < maxPat)
00216 {
00217 listSizes[bits] = curCount+1;
00218 lists[bits][curCount] = startBlock;
00219 }
00220 if (--blockMod == 0)
00221 {
00222 if (++startBlock >= maxBlockCount)
00223 tooManyBlocks();
00224 blockMod = blockSize;
00225 bp->bacIx = bacIx;
00226 bp->seqIx = seqIx;
00227 bp->seq = seq;
00228 bp->offset = i - (ps->seedSize-1) + 1;
00229 fixBlockSize(bp, startBlock);
00230 ++bp;
00231 }
00232 }
00233 for (i=0; i<ps->seedSize-1; ++i)
00234 {
00235 if (--blockMod == 0)
00236 {
00237 if (++startBlock >= maxBlockCount)
00238 tooManyBlocks();
00239 blockMod = blockSize;
00240 }
00241 }
00242 if (blockMod != blockSize)
00243 ++startBlock;
00244 return startBlock;
00245 }
00246
00247 struct patSpace *makePatSpace(
00248 struct dnaSeq **seqArray,
00249 int seqArrayCount,
00250 int seedSize,
00251 char *oocFileName,
00252 int minMatch,
00253 int maxGap
00254
00255 )
00256
00257
00258 {
00259 struct patSpace *ps = newPatSpace(minMatch, maxGap,seedSize);
00260 int i;
00261 int startIx = 0;
00262 int total = 0;
00263 struct dnaSeq *seq;
00264 psBits maxPat;
00265 psBits *listSizes;
00266 int seedSpaceSize = ps->seedSpaceSize;
00267
00268 maxPat = ps->maxPat = maxPatCount;
00269 for (i=0; i<seqArrayCount; ++i)
00270 {
00271 for (seq = seqArray[i]; seq != NULL; seq = seq->next)
00272 {
00273 total += seq->size;
00274 countPatSpace(ps, seq);
00275 }
00276 }
00277
00278 listSizes = ps->listSizes;
00279
00280
00281
00282 oocMaskCounts(oocFileName, listSizes, ps->seedSize, maxPat);
00283
00284
00285 oocMaskSimpleRepeats(listSizes, ps->seedSize, maxPat);
00286
00287
00288 allocPatSpaceLists(ps);
00289
00290
00291 for (i=0; i<ps->seedSpaceSize; ++i)
00292 {
00293 if (listSizes[i] < maxPat)
00294 listSizes[i] = 0;
00295 }
00296 for (i=0; i<seqArrayCount; ++i)
00297 {
00298 int j;
00299 for (seq = seqArray[i], j=0; seq != NULL; seq = seq->next, ++j)
00300 {
00301 startIx = addToPatSpace(ps, i, j, seq, startIx);
00302 if (startIx >= maxBlockCount)
00303 tooManyBlocks();
00304 }
00305 }
00306 ps->blocksUsed = startIx;
00307
00308
00309 for (i=0; i<seedSpaceSize; ++i)
00310 {
00311 if (listSizes[i] >= maxPat)
00312 listSizes[i] = 0;
00313 }
00314
00315 return ps;
00316 }
00317
00318
00319 void bfFind(struct patSpace *ps, int block, struct blockPos *ret)
00320
00321 {
00322 assert(block >= 0 && block < maxBlockCount);
00323 *ret = ps->blockPos[block];
00324 }
00325
00326
00327 static struct patClump *newClump(struct patSpace *ps, struct blockPos *first, struct blockPos *last)
00328
00329 {
00330 struct dnaSeq *seq = first->seq;
00331 int seqIx = first->seqIx;
00332 int bacIx = first->bacIx;
00333 int start = first->offset;
00334 int end = last->offset+last->size;
00335 int extraAtEnds = blockSize/2;
00336 struct patClump *cl;
00337
00338 start -= extraAtEnds;
00339 if (start < 0)
00340 start = 0;
00341 end += extraAtEnds;
00342 if (end >seq->size)
00343 end = seq->size;
00344 AllocVar(cl);
00345 cl->bacIx = bacIx;
00346 cl->seqIx = seqIx;
00347 cl->seq = seq;
00348 cl->start = start;
00349 cl->size = end-start;
00350 return cl;
00351 }
00352
00353
00354 static struct patClump *clumpHits(struct patSpace *ps,
00355 int hitBlocks[], int hitCount, int posBuf[],
00356 DNA *dna, int dnaSize)
00357
00358 {
00359
00360 int block;
00361 int i;
00362 int maxGap = ps->maxGap;
00363 struct blockPos first, cur, pre;
00364 struct patClump *patClump = NULL, *cl;
00365
00366 bfFind(ps, hitBlocks[0], &first);
00367 pre = first;
00368 for (i=1; i<hitCount; ++i)
00369 {
00370 block = hitBlocks[i];
00371 bfFind(ps, block, &cur);
00372 if (cur.seq != pre.seq || cur.offset - (pre.offset + pre.size) > maxGap)
00373 {
00374
00375 cl = newClump(ps, &first, &pre);
00376 slAddHead(&patClump, cl);
00377 first = cur;
00378 }
00379 else
00380 {
00381
00382 }
00383 pre = cur;
00384 }
00385
00386 cl = newClump(ps, &first, &pre);
00387 slAddHead(&patClump, cl);
00388 slReverse(&patClump);
00389 return patClump;
00390 }
00391
00392 struct patClump *patSpaceFindOne(struct patSpace *ps, DNA *dna, int dnaSize)
00393
00394 {
00395 int lastStart = dnaSize - ps->seedSize;
00396 int i,j;
00397 int pat;
00398 int hitBlockCount = 0;
00399 int totalSigHits = 0;
00400 DNA *tile = dna;
00401 int blocksUsed = ps->blocksUsed;
00402 int *posBuf = ps->posBuf;
00403 int *hitBlocks = ps->hitBlocks;
00404 int minMatch = ps->minMatch;
00405
00406 memset(ps->posBuf, 0, sizeof(ps->posBuf[0]) * blocksUsed);
00407 for (i=0; i<=lastStart; i += ps->seedSize)
00408 {
00409 psBits *list;
00410 psBits count;
00411
00412 pat = 0;
00413 for (j=0; j<ps->seedSize; ++j)
00414 {
00415 int bVal = ntValNoN[(int)tile[j]];
00416 pat <<= 2;
00417 pat += bVal;
00418 }
00419 list = ps->lists[pat];
00420 if ((count = ps->listSizes[pat]) > 0)
00421 {
00422 for (j=0; j<count; ++j)
00423 posBuf[list[j]] += 1;
00424 }
00425 tile += ps->seedSize;
00426 }
00427
00428
00429 for (i=0; i<blocksUsed-1; ++i)
00430 {
00431
00432 int a = posBuf[i], b = posBuf[i+1];
00433 int sum = a + b;
00434 if (sum >= minMatch)
00435 {
00436 if (a > 0)
00437 {
00438 if (hitBlockCount == 0 || hitBlocks[hitBlockCount-1] != i)
00439 {
00440 hitBlocks[hitBlockCount++] = i;
00441 totalSigHits += a;
00442 }
00443 }
00444 if (b > 0)
00445 {
00446 hitBlocks[hitBlockCount++] = i+1;
00447 totalSigHits += b;
00448 }
00449 }
00450 }
00451
00452
00453 if (hitBlockCount > 0 && totalSigHits*ps->seedSize*8 > dnaSize)
00454 {
00455 return clumpHits(ps, hitBlocks, hitBlockCount, posBuf, dna,
00456 dnaSize);
00457 }
00458 else
00459 return NULL;
00460 }
00461
00462
00463
00464