lib/twoBit.c

Go to the documentation of this file.
00001 #include "common.h"
00002 #include "hash.h"
00003 #include "dnaseq.h"
00004 #include "dnautil.h"
00005 #include "sig.h"
00006 #include "localmem.h"
00007 #include "linefile.h"
00008 #include "obscure.h"
00009 #include "twoBit.h"
00010 #include <limits.h>
00011 
00012 static char const rcsid[] = "$Id: twoBit.c,v 1.22 2006/07/31 22:18:22 angie Exp $";
00013 
00014 static int countBlocksOfN(char *s, int size)
00015 /* Count number of blocks of N's (or n's) in s. */
00016 {
00017 int i;
00018 boolean isN, lastIsN = FALSE;
00019 char c;
00020 int blockCount = 0;
00021 
00022 for (i=0; i<size; ++i)
00023     {
00024     c = s[i];
00025     isN = (c == 'n' || c == 'N');
00026     if (isN && !lastIsN)
00027         ++blockCount;
00028     lastIsN = isN;
00029     }
00030 return blockCount;
00031 }
00032 
00033 static int countBlocksOfLower(char *s, int size)
00034 /* Count number of blocks of lower case letters. */
00035 {
00036 int i;
00037 boolean isLower, lastIsLower = FALSE;
00038 int blockCount = 0;
00039 
00040 for (i=0; i<size; ++i)
00041     {
00042     isLower = islower(s[i]);
00043     if (isLower && !lastIsLower)
00044         ++blockCount;
00045     lastIsLower = isLower;
00046     }
00047 return blockCount;
00048 }
00049 
00050 static void storeBlocksOfN(char *s, int size, bits32 *starts, bits32 *sizes)
00051 /* Store starts and sizes of blocks of N's. */
00052 {
00053 int i;
00054 boolean isN, lastIsN = FALSE;
00055 int startN = 0;
00056 char c;
00057 
00058 for (i=0; i<size; ++i)
00059     {
00060     c = s[i];
00061     isN = (c == 'n' || c == 'N');
00062     if (isN)
00063         {
00064         if (!lastIsN)
00065             startN = i;
00066         }
00067     else
00068         {
00069         if (lastIsN)
00070             {
00071             *starts++ = startN;
00072             *sizes++ = i - startN;
00073             }
00074         }
00075     lastIsN = isN;
00076     }
00077 if (lastIsN)
00078     {
00079     *starts = startN;
00080     *sizes = i - startN;
00081     }
00082 }
00083 
00084 static void storeBlocksOfLower(char *s, int size, bits32 *starts, bits32 *sizes)
00085 /* Store starts and sizes of blocks of lower case letters. */
00086 {
00087 int i;
00088 boolean isLower, lastIsLower = FALSE;
00089 int startLower = 0;
00090 
00091 for (i=0; i<size; ++i)
00092     {
00093     isLower = islower(s[i]);
00094     if (isLower)
00095         {
00096         if (!lastIsLower)
00097             startLower = i;
00098         }
00099     else
00100         {
00101         if (lastIsLower)
00102             {
00103             *starts++ = startLower;
00104             *sizes++ = i - startLower;
00105             }
00106         }
00107     lastIsLower = isLower;
00108     }
00109 if (lastIsLower)
00110     {
00111     *starts = startLower;
00112     *sizes = i - startLower;
00113     }
00114 }
00115 
00116 static int packedSize(int unpackedSize)
00117 /* Return size when packed, rounding up. */
00118 {
00119 return ((unpackedSize + 3) >> 2);
00120 }
00121 
00122 struct twoBit *twoBitFromDnaSeq(struct dnaSeq *seq, boolean doMask)
00123 /* Convert dnaSeq representation in memory to twoBit representation.
00124  * If doMask is true interpret lower-case letters as masked. */
00125 {
00126 int ubyteSize = packedSize(seq->size);
00127 UBYTE *pt;
00128 struct twoBit *twoBit;
00129 DNA last4[4];   /* Holds few bases. */
00130 DNA *dna;
00131 int i, end;
00132 
00133 /* Allocate structure and fill in name. */
00134 AllocVar(twoBit);
00135 pt = AllocArray(twoBit->data, ubyteSize);
00136 twoBit->name = cloneString(seq->name);
00137 twoBit->size = seq->size;
00138 
00139 /* Convert to 4-bases per byte representation. */
00140 dna = seq->dna;
00141 end = seq->size - 4;
00142 for (i=0; i<end; i += 4)
00143     {
00144     *pt++ = packDna4(dna+i);
00145     }
00146 
00147 /* Take care of conversion of last few bases. */
00148 last4[0] = last4[1] = last4[2] = last4[3] = 'T';
00149 memcpy(last4, dna+i, seq->size-i);
00150 *pt = packDna4(last4);
00151 
00152 /* Deal with blocks of N. */
00153 twoBit->nBlockCount = countBlocksOfN(dna, seq->size);
00154 if (twoBit->nBlockCount > 0)
00155     {
00156     AllocArray(twoBit->nStarts, twoBit->nBlockCount);
00157     AllocArray(twoBit->nSizes, twoBit->nBlockCount);
00158     storeBlocksOfN(dna, seq->size, twoBit->nStarts, twoBit->nSizes);
00159     }
00160 
00161 /* Deal with masking */
00162 if (doMask)
00163     {
00164     twoBit->maskBlockCount = countBlocksOfLower(dna, seq->size);
00165     if (twoBit->maskBlockCount > 0)
00166         {
00167         AllocArray(twoBit->maskStarts, twoBit->maskBlockCount);
00168         AllocArray(twoBit->maskSizes, twoBit->maskBlockCount);
00169         storeBlocksOfLower(dna, seq->size, 
00170                 twoBit->maskStarts, twoBit->maskSizes);
00171         }
00172     }
00173 return twoBit;
00174 }
00175 
00176 
00177 static int twoBitSizeInFile(struct twoBit *twoBit)
00178 /* Figure out size structure will take in file. */
00179 {
00180 return packedSize(twoBit->size) 
00181         + sizeof(twoBit->size)
00182         + sizeof(twoBit->nBlockCount)
00183         + sizeof(twoBit->nStarts[0]) * twoBit->nBlockCount
00184         + sizeof(twoBit->nSizes[0]) * twoBit->nBlockCount
00185         + sizeof(twoBit->maskBlockCount)
00186         + sizeof(twoBit->maskStarts[0]) * twoBit->maskBlockCount
00187         + sizeof(twoBit->maskSizes[0]) * twoBit->maskBlockCount
00188         + sizeof(twoBit->reserved);
00189 }
00190 
00191 void twoBitWriteOne(struct twoBit *twoBit, FILE *f)
00192 /* Write out one twoBit sequence to binary file. 
00193  * Note this does not include the name, which is
00194  * stored only in index. */
00195 {
00196 writeOne(f, twoBit->size);
00197 writeOne(f, twoBit->nBlockCount);
00198 if (twoBit->nBlockCount > 0)
00199     {
00200     fwrite(twoBit->nStarts, sizeof(twoBit->nStarts[0]), 
00201         twoBit->nBlockCount, f);
00202     fwrite(twoBit->nSizes, sizeof(twoBit->nSizes[0]), 
00203         twoBit->nBlockCount, f);
00204     }
00205 writeOne(f, twoBit->maskBlockCount);
00206 if (twoBit->maskBlockCount > 0)
00207     {
00208     fwrite(twoBit->maskStarts, sizeof(twoBit->maskStarts[0]), 
00209         twoBit->maskBlockCount, f);
00210     fwrite(twoBit->maskSizes, sizeof(twoBit->maskSizes[0]), 
00211         twoBit->maskBlockCount, f);
00212     }
00213 writeOne(f, twoBit->reserved);
00214 mustWrite(f, twoBit->data, packedSize(twoBit->size));
00215 }
00216 
00217 void twoBitWriteHeader(struct twoBit *twoBitList, FILE *f)
00218 /* Write out header portion of twoBit file, including initial
00219  * index */
00220 {
00221 bits32 sig = twoBitSig;
00222 bits32 version = 0;
00223 bits32 seqCount = slCount(twoBitList);
00224 bits32 reserved = 0;
00225 bits32 offset = 0;
00226 struct twoBit *twoBit;
00227 long long counter = 0; /* check for 32 bit overflow */
00228 
00229 /* Write out fixed parts of header. */
00230 writeOne(f, sig);
00231 writeOne(f, version);
00232 writeOne(f, seqCount);
00233 writeOne(f, reserved);
00234 
00235 /* Figure out location of first byte past index.
00236  * Each index entry contains 4 bytes of offset information
00237  * and the name of the sequence, which is variable length. */
00238 offset = sizeof(sig) + sizeof(version) + sizeof(seqCount) + sizeof(reserved);
00239 for (twoBit = twoBitList; twoBit != NULL; twoBit = twoBit->next)
00240     {
00241     int nameLen = strlen(twoBit->name);
00242     if (nameLen > 255)
00243         errAbort("name %s too long", twoBit->name);
00244     offset += nameLen + 1 + sizeof(bits32);
00245     }
00246 
00247 /* Write out index. */
00248 for (twoBit = twoBitList; twoBit != NULL; twoBit = twoBit->next)
00249     {
00250     int size = twoBitSizeInFile(twoBit);
00251     writeString(f, twoBit->name);
00252     writeOne(f, offset);
00253     offset += size;
00254     counter += (long long)size;
00255     if (counter > UINT_MAX )
00256         errAbort("Error in faToTwoBit, index overflow at %s. The 2bit format "
00257                 "does not support indexes larger than %dGb, \n"
00258                 "please split up into smaller files.\n", 
00259                 twoBit->name, UINT_MAX/1000000000);
00260     }
00261 }
00262 
00263 void twoBitClose(struct twoBitFile **pTbf)
00264 /* Free up resources associated with twoBitFile. */
00265 {
00266 struct twoBitFile *tbf = *pTbf;
00267 if (tbf != NULL)
00268     {
00269     freez(&tbf->fileName);
00270     carefulClose(&tbf->f);
00271     hashFree(&tbf->hash);
00272     /* The indexList is allocated out of the hash's memory pool. */
00273     freez(pTbf);
00274     }
00275 }
00276 
00277 static bits32 readBits32(FILE *f, boolean isSwapped)
00278 /* Read and optionally byte-swap 32 bit entity. */
00279 {
00280 bits32 val;
00281 mustReadOne(f, val);
00282 if (isSwapped) 
00283     val = byteSwap32(val);
00284 return val;
00285 }
00286 
00287 struct twoBitFile *twoBitOpen(char *fileName)
00288 /* Open file, read in header and index.  
00289  * Squawk and die if there is a problem. */
00290 {
00291 bits32 sig;
00292 struct twoBitFile *tbf;
00293 struct twoBitIndex *index;
00294 boolean isSwapped = FALSE;
00295 int i;
00296 struct hash *hash;
00297 FILE *f = mustOpen(fileName, "rb");
00298 
00299 /* Allocate header verify signature, and read in
00300  * the constant-length bits. */
00301 AllocVar(tbf);
00302 mustReadOne(f, sig);
00303 if (sig == twoBitSwapSig)
00304     isSwapped = tbf->isSwapped = TRUE;
00305 else if (sig != twoBitSig)
00306     errAbort("%s doesn't have a valid twoBitSig", fileName);
00307 tbf->fileName = cloneString(fileName);
00308 tbf->f = f;
00309 tbf->version = readBits32(f, isSwapped);
00310 if (tbf->version != 0)
00311     {
00312     errAbort("Can only handle version 0 of this file. This is version %d",
00313         (int)tbf->version);
00314     }
00315 tbf->seqCount = readBits32(f, isSwapped);
00316 tbf->reserved = readBits32(f, isSwapped);
00317 
00318 /* Read in index. */
00319 hash = tbf->hash = hashNew(digitsBaseTwo(tbf->seqCount));
00320 for (i=0; i<tbf->seqCount; ++i)
00321     {
00322     char name[256];
00323     if (!fastReadString(f, name))
00324         errAbort("%s is truncated", fileName);
00325     lmAllocVar(hash->lm, index);
00326     index->offset = readBits32(f, isSwapped);
00327     hashAddSaveName(hash, name, index, &index->name);
00328     slAddHead(&tbf->indexList, index);
00329     }
00330 slReverse(&tbf->indexList);
00331 return tbf;
00332 }
00333 
00334 
00335 static int findGreatestLowerBound(int blockCount, bits32 *pos, 
00336         int val)
00337 /* Find index of greatest element in posArray that is less 
00338  * than or equal to val using a binary search. */
00339 {
00340 int startIx=0, endIx=blockCount-1, midIx;
00341 int posVal;
00342 
00343 for (;;)
00344     {
00345     if (startIx == endIx)
00346         {
00347         posVal = pos[startIx];
00348         if (posVal <= val || startIx == 0)
00349             return startIx;
00350         else
00351             return startIx-1;
00352         }
00353     midIx = ((startIx + endIx)>>1);
00354     posVal = pos[midIx];
00355     if (posVal < val)
00356         startIx = midIx+1;
00357     else
00358         endIx = midIx;
00359     }
00360 }
00361 
00362 static void twoBitSeekTo(struct twoBitFile *tbf, char *name)
00363 /* Seek to start of named record.  Abort if can't find it. */
00364 {
00365 struct twoBitIndex *index = hashFindVal(tbf->hash, name);
00366 if (index == NULL)
00367      errAbort("%s is not in %s", name, tbf->fileName);
00368 fseek(tbf->f, index->offset, SEEK_SET);
00369 }
00370 
00371 static void readBlockCoords(FILE *f, boolean isSwapped, bits32 *retBlockCount,
00372                             bits32 **retBlockStarts, bits32 **retBlockSizes)
00373 /* Read in blockCount, starts and sizes from file. (Same structure used for
00374  * both blocks of N's and masked blocks.) */
00375 {
00376 bits32 blkCount = readBits32(f, isSwapped);
00377 *retBlockCount = blkCount;
00378 if (blkCount == 0)
00379     {
00380     *retBlockStarts = NULL;
00381     *retBlockSizes = NULL;
00382     }
00383 else
00384     {
00385     bits32 *nStarts, *nSizes;
00386     AllocArray(nStarts, blkCount);
00387     AllocArray(nSizes, blkCount);
00388     fread(nStarts, sizeof(nStarts[0]), blkCount, f);
00389     fread(nSizes, sizeof(nSizes[0]), blkCount, f);
00390     if (isSwapped)
00391         {
00392         int i;
00393         for (i=0; i<blkCount; ++i)
00394             {
00395             nStarts[i] = byteSwap32(nStarts[i]);
00396             nSizes[i] = byteSwap32(nSizes[i]);
00397             }
00398         }
00399     *retBlockStarts = nStarts;
00400     *retBlockSizes = nSizes;
00401     }
00402 }
00403 
00404 struct twoBit *twoBitFromFile(char *fileName)
00405 /* Get twoBit list of all sequences in twoBit file. */
00406 {
00407 struct twoBitFile *tbf = twoBitOpen(fileName);
00408 struct twoBitIndex *index;
00409 struct twoBit *twoBitList = NULL, *twoBit = NULL;
00410 boolean isSwapped = tbf->isSwapped;
00411 FILE *f = tbf->f;
00412 
00413 for (index = tbf->indexList; index != NULL; index = index->next)
00414     {
00415     char *name = index->name;
00416     bits32 packByteCount;
00417 
00418     AllocVar(twoBit);
00419     twoBit->name = cloneString(name);
00420 
00421     /* Find offset in index and seek to it */
00422     twoBitSeekTo(tbf, name);
00423 
00424     /* Read in seqSize. */
00425     twoBit->size = readBits32(f, isSwapped);
00426 
00427     /* Read in blocks of N. */
00428     readBlockCoords(f, isSwapped, &(twoBit->nBlockCount),
00429                     &(twoBit->nStarts), &(twoBit->nSizes));
00430 
00431     /* Read in masked blocks. */
00432     readBlockCoords(f, isSwapped, &(twoBit->maskBlockCount),
00433                     &(twoBit->maskStarts), &(twoBit->maskSizes));
00434 
00435     /* Reserved word. */
00436     twoBit->reserved = readBits32(f, isSwapped);
00437 
00438     /* Read in data. */
00439     packByteCount = packedSize(twoBit->size);
00440     twoBit->data = needLargeMem(packByteCount);
00441     mustRead(f, twoBit->data, packByteCount);
00442 
00443     slAddHead(&twoBitList, twoBit);
00444     }
00445 
00446 twoBitClose(&tbf);
00447 slReverse(&twoBitList);
00448 return twoBitList;
00449 }
00450 
00451 struct dnaSeq *twoBitReadSeqFragExt(struct twoBitFile *tbf, char *name,
00452         int fragStart, int fragEnd, boolean doMask, int *retFullSize)
00453 /* Read part of sequence from .2bit file.  To read full
00454  * sequence call with start=end=0.  Sequence will be lower
00455  * case if doMask is false, mixed case (repeats in lower)
00456  * if doMask is true. */
00457 {
00458 struct dnaSeq *seq;
00459 bits32 seqSize;
00460 bits32 nBlockCount, maskBlockCount;
00461 bits32 *nStarts = NULL, *nSizes = NULL;
00462 bits32 *maskStarts = NULL, *maskSizes = NULL;
00463 boolean isSwapped = tbf->isSwapped;
00464 FILE *f = tbf->f;
00465 int i;
00466 int packByteCount, packedStart, packedEnd, remainder, midStart, midEnd;
00467 int outSize;
00468 UBYTE *packed, *packedAlloc;
00469 DNA *dna;
00470 
00471 /* Find offset in index and seek to it */
00472 dnaUtilOpen();
00473 twoBitSeekTo(tbf, name);
00474 
00475 /* Read in seqSize. */
00476 seqSize = readBits32(f, isSwapped);
00477 if (fragEnd == 0)
00478     fragEnd = seqSize;
00479 if (fragEnd > seqSize)
00480     errAbort("twoBitReadSeqFrag in %s end (%d) >= seqSize (%d)", name, fragEnd, seqSize);
00481 outSize = fragEnd - fragStart;
00482 if (outSize < 1)
00483     errAbort("twoBitReadSeqFrag in %s start (%d) >= end (%d)", name, fragStart, fragEnd);
00484 
00485 /* Read in blocks of N. */
00486 readBlockCoords(f, isSwapped, &nBlockCount, &nStarts, &nSizes);
00487 
00488 /* Read in masked blocks. */
00489 readBlockCoords(f, isSwapped, &maskBlockCount, &maskStarts, &maskSizes);
00490 
00491 /* Skip over reserved word. */
00492 readBits32(f, isSwapped);
00493 
00494 /* Allocate dnaSeq, and fill in zero tag at end of sequence. */
00495 AllocVar(seq);
00496 if (outSize == seqSize)
00497     seq->name = cloneString(name);
00498 else
00499     {
00500     char buf[256*2];
00501     safef(buf, sizeof(buf), "%s:%d-%d", name, fragStart, fragEnd);
00502     seq->name = cloneString(buf);
00503     }
00504 seq->size = outSize;
00505 dna = seq->dna = needLargeMem(outSize+1);
00506 seq->dna[outSize] = 0;
00507 
00508 
00509 /* Skip to bits we need and read them in. */
00510 packedStart = (fragStart>>2);
00511 packedEnd = ((fragEnd+3)>>2);
00512 packByteCount = packedEnd - packedStart;
00513 packed = packedAlloc = needLargeMem(packByteCount);
00514 fseek(f, packedStart, SEEK_CUR);
00515 mustRead(f, packed, packByteCount);
00516 
00517 /* Handle case where everything is in one packed byte */
00518 if (packByteCount == 1)
00519     {
00520     int pOff = (packedStart<<2);
00521     int pStart = fragStart - pOff;
00522     int pEnd = fragEnd - pOff;
00523     UBYTE partial = *packed;
00524     assert(pEnd <= 4);
00525     assert(pStart >= 0);
00526     for (i=pStart; i<pEnd; ++i)
00527         *dna++ = valToNt[(partial >> (6-i-i)) & 3];
00528     }
00529 else
00530     {
00531     /* Handle partial first packed byte. */
00532     midStart = fragStart;
00533     remainder = (fragStart&3);
00534     if (remainder > 0)
00535         {
00536         UBYTE partial = *packed++;
00537         int partCount = 4 - remainder;
00538         for (i=partCount-1; i>=0; --i)
00539             {
00540             dna[i] = valToNt[partial&3];
00541             partial >>= 2;
00542             }
00543         midStart += partCount;
00544         dna += partCount;
00545         }
00546 
00547     /* Handle middle bytes. */
00548     remainder = fragEnd&3;
00549     midEnd = fragEnd - remainder;
00550     for (i=midStart; i<midEnd; i += 4)
00551         {
00552         UBYTE b = *packed++;
00553         dna[3] = valToNt[b&3];
00554         b >>= 2;
00555         dna[2] = valToNt[b&3];
00556         b >>= 2;
00557         dna[1] = valToNt[b&3];
00558         b >>= 2;
00559         dna[0] = valToNt[b&3];
00560         dna += 4;
00561         }
00562 
00563     if (remainder >0)
00564         {
00565         UBYTE part = *packed;
00566         part >>= (8-remainder-remainder);
00567         for (i=remainder-1; i>=0; --i)
00568             {
00569             dna[i] = valToNt[part&3];
00570             part >>= 2;
00571             }
00572         }
00573     }
00574 freez(&packedAlloc);
00575 
00576 if (nBlockCount > 0)
00577     {
00578     int startIx = findGreatestLowerBound(nBlockCount, nStarts, fragStart);
00579     for (i=startIx; i<nBlockCount; ++i)
00580         {
00581         int s = nStarts[i];
00582         int e = s + nSizes[i];
00583         if (s >= fragEnd)
00584             break;
00585         if (s < fragStart)
00586            s = fragStart;
00587         if (e > fragEnd)
00588            e = fragEnd;
00589         if (s < e)
00590             memset(seq->dna + s - fragStart, 'n', e - s);
00591         }
00592     }
00593 
00594 if (doMask)
00595     {
00596     toUpperN(seq->dna, seq->size);
00597     if (maskBlockCount > 0)
00598         {
00599         int startIx = findGreatestLowerBound(maskBlockCount, maskStarts, 
00600                 fragStart);
00601         for (i=startIx; i<maskBlockCount; ++i)
00602             {
00603             int s = maskStarts[i];
00604             int e = s + maskSizes[i];
00605             if (s >= fragEnd)
00606                 break;
00607             if (s < fragStart)
00608                 s = fragStart;
00609             if (e > fragEnd)
00610                 e = fragEnd;
00611             if (s < e)
00612                 toLowerN(seq->dna + s - fragStart, e - s);
00613             }
00614         }
00615     }
00616 freez(&nStarts);
00617 freez(&nSizes);
00618 freez(&maskStarts);
00619 freez(&maskSizes);
00620 if (retFullSize != NULL)
00621     *retFullSize = seqSize;
00622 return seq;
00623 }
00624 
00625 struct dnaSeq *twoBitReadSeqFrag(struct twoBitFile *tbf, char *name,
00626         int fragStart, int fragEnd)
00627 /* Read part of sequence from .2bit file.  To read full
00628  * sequence call with start=end=0.  Note that sequence will
00629  * be mixed case, with repeats in lower case and rest in
00630  * upper case. */
00631 {
00632 return twoBitReadSeqFragExt(tbf, name, fragStart, fragEnd, TRUE, NULL);
00633 }
00634 
00635 struct dnaSeq *twoBitReadSeqFragLower(struct twoBitFile *tbf, char *name,
00636         int fragStart, int fragEnd)
00637 /* Same as twoBitReadSeqFrag, but sequence is returned in lower case. */
00638 {
00639 return twoBitReadSeqFragExt(tbf, name, fragStart, fragEnd, FALSE, NULL);
00640 }
00641 
00642 int twoBitSeqSize(struct twoBitFile *tbf, char *name)
00643 /* Return size of sequence in two bit file in bases. */
00644 {
00645 twoBitSeekTo(tbf, name);
00646 return readBits32(tbf->f, tbf->isSwapped);
00647 }
00648 
00649 long long twoBitTotalSize(struct twoBitFile *tbf)
00650 /* Return total size of all sequences in two bit file. */
00651 {
00652 struct twoBitIndex *index;
00653 long long totalSize = 0;
00654 for (index = tbf->indexList; index != NULL; index = index->next)
00655     {
00656     fseek(tbf->f, index->offset, SEEK_SET);
00657     totalSize += readBits32(tbf->f, tbf->isSwapped);
00658     }
00659 return totalSize;
00660 }
00661 
00662 struct dnaSeq *twoBitLoadAll(char *spec)
00663 /* Return list of all sequences matching spec, which is in
00664  * the form:
00665  *
00666  *    file/path/input.2bit[:seqSpec1][,seqSpec2,...]
00667  *
00668  * where seqSpec is either
00669  *     seqName
00670  *  or
00671  *     seqName:start-end */
00672 {
00673 struct twoBitSpec *tbs = twoBitSpecNew(spec);
00674 struct twoBitFile *tbf = twoBitOpen(tbs->fileName);
00675 struct dnaSeq *list = NULL;
00676 if (tbs->seqs != NULL)
00677     {
00678     struct twoBitSeqSpec *tbss;
00679     for (tbss = tbs->seqs; tbss != NULL; tbss = tbss->next)
00680         slSafeAddHead(&list, twoBitReadSeqFrag(tbf, tbss->name,
00681                                                tbss->start, tbss->end));
00682     }
00683 else
00684     {
00685     struct twoBitIndex *index;
00686     for (index = tbf->indexList; index != NULL; index = index->next)
00687         slSafeAddHead(&list, twoBitReadSeqFrag(tbf, index->name, 0, 0));
00688     }
00689 slReverse(&list);
00690 twoBitClose(&tbf);
00691 twoBitSpecFree(&tbs);
00692 return list;
00693 }
00694 
00695 struct slName *twoBitSeqNames(char *fileName)
00696 /* Get list of all sequences in twoBit file. */
00697 {
00698 struct twoBitFile *tbf = twoBitOpen(fileName);
00699 struct twoBitIndex *index;
00700 struct slName *name, *list = NULL;
00701 for (index = tbf->indexList; index != NULL; index = index->next)
00702     {
00703     name = slNameNew(index->name);
00704     slAddHead(&list, name);
00705     }
00706 twoBitClose(&tbf);
00707 slReverse(&list);
00708 return list;
00709 }
00710 
00711 boolean twoBitIsFile(char *fileName)
00712 /* Return TRUE if file is in .2bit format. */
00713 {
00714 return endsWith(fileName, ".2bit");
00715 }
00716 
00717 boolean twoBitParseRange(char *rangeSpec, char **retFile, 
00718         char **retSeq, int *retStart, int *retEnd)
00719 /* Parse out something in format
00720  *    file/path/name:seqName:start-end
00721  * or
00722  *    file/path/name:seqName
00723  * or
00724  *    file/path/name:seqName1,seqName2,seqName3,...
00725  * This will destroy the input 'rangeSpec' in the process.  Returns FALSE if
00726  * it doesn't fit this format, setting retFile to rangeSpec, and retSet to
00727  * null.  If it is the shorter form then start and end will both be returned
00728  * as zero, which is ok by twoBitReadSeqFrag.  Any of the return arguments
00729  * maybe NULL.
00730  */
00731 {
00732 char *s, *e;
00733 int n;
00734 
00735 /* default returns */
00736 if (retFile != NULL)
00737     *retFile = rangeSpec;
00738 if (retSeq != NULL)
00739     *retSeq = NULL;
00740 if (retStart != NULL)
00741     *retStart = 0;
00742 if (retEnd != NULL)
00743     *retEnd = 0;
00744 
00745 /* start with final name  */
00746 s = strrchr(rangeSpec, '/');
00747 if (s == NULL)
00748     s = rangeSpec;
00749 else
00750     s++;
00751 
00752 /* Grab seqName, zero terminate fileName. */
00753 s = strchr(s, ':');
00754 if (s == NULL)
00755     return FALSE;
00756 *s++ = 0;
00757 if (retSeq != NULL)
00758     *retSeq = s;
00759 
00760 /* Grab start, zero terminate seqName. */
00761 s = strchr(s, ':');
00762 if (s == NULL)
00763     return TRUE;  /* no range spec */
00764 *s++ = 0;
00765 n = strtol(s, &e, 0);
00766 if (*e != '-')
00767     return FALSE; /* not a valid range */
00768 if (retStart != NULL)
00769     *retStart = n;
00770 s = e+1;
00771 
00772 /* Grab end. */
00773 n = strtol(s, &e, 0);
00774 if (*e != '\0')
00775     return FALSE; /* not a valid range */
00776 if (retEnd != NULL)
00777     *retEnd = n;
00778 return TRUE;
00779 }
00780 
00781 boolean twoBitIsRange(char *rangeSpec)
00782 /* Return TRUE if it looks like a two bit range specifier. */
00783 {
00784 char *dupe = cloneString(rangeSpec);
00785 char *file, *seq;
00786 int start, end;
00787 boolean isRange = twoBitParseRange(dupe, &file, &seq, &start, &end);
00788 if (isRange)
00789     isRange = twoBitIsFile(file);
00790 freeMem(dupe);
00791 return isRange;
00792 }
00793 
00794 boolean twoBitIsFileOrRange(char *spec)
00795 /* Return TRUE if it is a two bit file or subrange. */
00796 {
00797 return twoBitIsFile(spec) || twoBitIsRange(spec);
00798 }
00799 
00800 static struct twoBitSeqSpec *parseSeqSpec(char *seqSpecStr)
00801 /* parse one sequence spec */
00802 {
00803 boolean isOk = TRUE;
00804 char *s, *e;
00805 struct twoBitSeqSpec *seq;
00806 AllocVar(seq);
00807 seq->name = cloneString(seqSpecStr);
00808 
00809 /* Grab start */
00810 s = strchr(seq->name, ':');
00811 if (s == NULL)
00812     return seq;  /* no range spec */
00813 *s++ = 0;
00814 seq->start = strtol(s, &e, 0);
00815 if (*e != '-')
00816     isOk = FALSE;
00817 else
00818     {
00819     /* Grab end */
00820     s = e+1;
00821     seq->end = strtol(s, &e, 0);
00822     if (*e != '\0')
00823         isOk = FALSE;
00824     }
00825 if (!isOk || (seq->end < seq->start))
00826     errAbort("invalid twoBit sequence specification: \"%s\"", seqSpecStr);
00827 return seq;
00828 }
00829 
00830 boolean twoBitIsSpec(char *spec)
00831 /* Return TRUE spec is a valid 2bit spec (see twoBitSpecNew) */
00832 {
00833 struct twoBitSpec *tbs = twoBitSpecNew(spec);
00834 boolean isSpec = (tbs != NULL);
00835 twoBitSpecFree(&tbs);
00836 return isSpec;
00837 }
00838 
00839 struct twoBitSpec *twoBitSpecNew(char *specStr)
00840 /* Parse a .2bit file and sequence spec into an object.
00841  * The spec is a string in the form:
00842  *
00843  *    file/path/input.2bit[:seqSpec1][,seqSpec2,...]
00844  *
00845  * where seqSpec is either
00846  *     seqName
00847  *  or
00848  *     seqName:start-end
00849  *
00850  * free result with twoBitSpecFree().
00851  */
00852 {
00853 char *s, *e;
00854 int i, numSeqs;
00855 char **seqSpecs;
00856 struct twoBitSpec *spec;
00857 AllocVar(spec);
00858 spec->fileName = cloneString(specStr);
00859 
00860 /* start with final file name  */
00861 s = strrchr(spec->fileName, '/');
00862 if (s == NULL)
00863     s = spec->fileName;
00864 else
00865     s++;
00866 
00867 /* find end of file name and zero-terminate */
00868 e = strchr(s, ':');
00869 if (e == NULL)
00870     s = NULL; /* just file name */
00871 else
00872     {
00873     *e++ = '\0';
00874     s = e;
00875     }
00876 if (!endsWith(spec->fileName, ".2bit"))
00877     {
00878     twoBitSpecFree(&spec);
00879     return NULL; /* not a 2bit file */
00880     }
00881 
00882 if (s != NULL)
00883     {
00884     /* chop seqs at commas and parse */
00885     numSeqs = chopString(s, ",", NULL, 0);
00886     AllocArray(seqSpecs, numSeqs);
00887     chopString(s, ",", seqSpecs, numSeqs);
00888     for (i = 0; i< numSeqs; i++)
00889         slSafeAddHead(&spec->seqs, parseSeqSpec(seqSpecs[i]));
00890     slReverse(&spec->seqs);
00891     }
00892 return spec;
00893 }
00894 
00895 struct twoBitSpec *twoBitSpecNewFile(char *twoBitFile, char *specFile)
00896 /* parse a file containing a list of specifications for sequences in the
00897  * specified twoBit file. Specifications are one per line in forms:
00898  *     seqName
00899  *  or
00900  *     seqName:start-end
00901  */
00902 {
00903 struct lineFile *lf = lineFileOpen(specFile, TRUE);
00904 char *line;
00905 struct twoBitSpec *spec;
00906 AllocVar(spec);
00907 spec->fileName = cloneString(twoBitFile);
00908 while (lineFileNextReal(lf, &line))
00909     slSafeAddHead(&spec->seqs, parseSeqSpec(trimSpaces(line)));
00910 slReverse(&spec->seqs);
00911 lineFileClose(&lf);
00912 return spec;
00913 }
00914 
00915 void twoBitSpecFree(struct twoBitSpec **specPtr)
00916 /* free a twoBitSpec object */
00917 {
00918 struct twoBitSpec *spec = *specPtr;
00919 if (spec != NULL)
00920     {
00921     struct twoBitSeqSpec *seq;
00922     while ((seq = slPopHead(&spec->seqs)) != NULL)
00923         {
00924         freeMem(seq->name);
00925         freeMem(seq);
00926         }
00927     freeMem(spec->fileName);
00928     freeMem(spec);
00929     *specPtr = NULL;
00930     }
00931 }
00932 
00933 void twoBitOutNBeds(struct twoBitFile *tbf, char *seqName, FILE *outF)
00934 /* output a series of bed3's that enumerate the number of N's in a sequence*/
00935 {
00936 int nBlockCount;
00937 
00938 twoBitSeekTo(tbf, seqName);
00939 
00940 readBits32(tbf->f, tbf->isSwapped);
00941 
00942 /* Read in blocks of N. */
00943 nBlockCount = readBits32(tbf->f, tbf->isSwapped);
00944 
00945 if (nBlockCount > 0)
00946     {
00947     bits32 *nStarts = NULL, *nSizes = NULL;
00948     int i;
00949 
00950     AllocArray(nStarts, nBlockCount);
00951     AllocArray(nSizes, nBlockCount);
00952     fread(nStarts, sizeof(nStarts[0]), nBlockCount, tbf->f);
00953     fread(nSizes, sizeof(nSizes[0]), nBlockCount, tbf->f);
00954     if (tbf->isSwapped)
00955         {
00956         for (i=0; i<nBlockCount; ++i)
00957             {
00958             nStarts[i] = byteSwap32(nStarts[i]);
00959             nSizes[i] = byteSwap32(nSizes[i]);
00960             }
00961         }
00962 
00963     for (i=0; i<nBlockCount; ++i)
00964         {
00965         fprintf(outF, "%s\t%d\t%d\n", seqName, nStarts[i], nStarts[i] + nSizes[i]);
00966         }
00967 
00968     freez(&nStarts);
00969     freez(&nSizes);
00970     }
00971 }
00972 
00973 int twoBitSeqSizeNoNs(struct twoBitFile *tbf, char *seqName)
00974 /* return the size of the sequence, not counting N's*/
00975 {
00976 int nBlockCount;
00977 int size;
00978 
00979 twoBitSeekTo(tbf, seqName);
00980 
00981 size = readBits32(tbf->f, tbf->isSwapped);
00982 
00983 /* Read in blocks of N. */
00984 nBlockCount = readBits32(tbf->f, tbf->isSwapped);
00985 
00986 if (nBlockCount > 0)
00987     {
00988     bits32 *nStarts = NULL, *nSizes = NULL;
00989     
00990     int i;
00991 
00992     AllocArray(nStarts, nBlockCount);
00993     AllocArray(nSizes, nBlockCount);
00994     fread(nStarts, sizeof(nStarts[0]), nBlockCount, tbf->f);
00995     fread(nSizes, sizeof(nSizes[0]), nBlockCount, tbf->f);
00996     if (tbf->isSwapped)
00997         {
00998         for (i=0; i<nBlockCount; ++i)
00999             {
01000             nStarts[i] = byteSwap32(nStarts[i]);
01001             nSizes[i] = byteSwap32(nSizes[i]);
01002             }
01003         }
01004 
01005     for (i=0; i<nBlockCount; ++i)
01006         {
01007         size -= nSizes[i];
01008         }
01009 
01010     freez(&nStarts);
01011     freez(&nSizes);
01012     }
01013 
01014 return(size);
01015 }

Generated on Tue Dec 25 18:39:32 2007 for blat by  doxygen 1.5.2