00001 #include "common.h"
00002 #include "hash.h"
00003 #include "dnaseq.h"
00004 #include "dnautil.h"
00005 #include "sig.h"
00006 #include "localmem.h"
00007 #include "linefile.h"
00008 #include "obscure.h"
00009 #include "twoBit.h"
00010 #include <limits.h>
00011
00012 static char const rcsid[] = "$Id: twoBit.c,v 1.22 2006/07/31 22:18:22 angie Exp $";
00013
00014 static int countBlocksOfN(char *s, int size)
00015
00016 {
00017 int i;
00018 boolean isN, lastIsN = FALSE;
00019 char c;
00020 int blockCount = 0;
00021
00022 for (i=0; i<size; ++i)
00023 {
00024 c = s[i];
00025 isN = (c == 'n' || c == 'N');
00026 if (isN && !lastIsN)
00027 ++blockCount;
00028 lastIsN = isN;
00029 }
00030 return blockCount;
00031 }
00032
00033 static int countBlocksOfLower(char *s, int size)
00034
00035 {
00036 int i;
00037 boolean isLower, lastIsLower = FALSE;
00038 int blockCount = 0;
00039
00040 for (i=0; i<size; ++i)
00041 {
00042 isLower = islower(s[i]);
00043 if (isLower && !lastIsLower)
00044 ++blockCount;
00045 lastIsLower = isLower;
00046 }
00047 return blockCount;
00048 }
00049
00050 static void storeBlocksOfN(char *s, int size, bits32 *starts, bits32 *sizes)
00051
00052 {
00053 int i;
00054 boolean isN, lastIsN = FALSE;
00055 int startN = 0;
00056 char c;
00057
00058 for (i=0; i<size; ++i)
00059 {
00060 c = s[i];
00061 isN = (c == 'n' || c == 'N');
00062 if (isN)
00063 {
00064 if (!lastIsN)
00065 startN = i;
00066 }
00067 else
00068 {
00069 if (lastIsN)
00070 {
00071 *starts++ = startN;
00072 *sizes++ = i - startN;
00073 }
00074 }
00075 lastIsN = isN;
00076 }
00077 if (lastIsN)
00078 {
00079 *starts = startN;
00080 *sizes = i - startN;
00081 }
00082 }
00083
00084 static void storeBlocksOfLower(char *s, int size, bits32 *starts, bits32 *sizes)
00085
00086 {
00087 int i;
00088 boolean isLower, lastIsLower = FALSE;
00089 int startLower = 0;
00090
00091 for (i=0; i<size; ++i)
00092 {
00093 isLower = islower(s[i]);
00094 if (isLower)
00095 {
00096 if (!lastIsLower)
00097 startLower = i;
00098 }
00099 else
00100 {
00101 if (lastIsLower)
00102 {
00103 *starts++ = startLower;
00104 *sizes++ = i - startLower;
00105 }
00106 }
00107 lastIsLower = isLower;
00108 }
00109 if (lastIsLower)
00110 {
00111 *starts = startLower;
00112 *sizes = i - startLower;
00113 }
00114 }
00115
00116 static int packedSize(int unpackedSize)
00117
00118 {
00119 return ((unpackedSize + 3) >> 2);
00120 }
00121
00122 struct twoBit *twoBitFromDnaSeq(struct dnaSeq *seq, boolean doMask)
00123
00124
00125 {
00126 int ubyteSize = packedSize(seq->size);
00127 UBYTE *pt;
00128 struct twoBit *twoBit;
00129 DNA last4[4];
00130 DNA *dna;
00131 int i, end;
00132
00133
00134 AllocVar(twoBit);
00135 pt = AllocArray(twoBit->data, ubyteSize);
00136 twoBit->name = cloneString(seq->name);
00137 twoBit->size = seq->size;
00138
00139
00140 dna = seq->dna;
00141 end = seq->size - 4;
00142 for (i=0; i<end; i += 4)
00143 {
00144 *pt++ = packDna4(dna+i);
00145 }
00146
00147
00148 last4[0] = last4[1] = last4[2] = last4[3] = 'T';
00149 memcpy(last4, dna+i, seq->size-i);
00150 *pt = packDna4(last4);
00151
00152
00153 twoBit->nBlockCount = countBlocksOfN(dna, seq->size);
00154 if (twoBit->nBlockCount > 0)
00155 {
00156 AllocArray(twoBit->nStarts, twoBit->nBlockCount);
00157 AllocArray(twoBit->nSizes, twoBit->nBlockCount);
00158 storeBlocksOfN(dna, seq->size, twoBit->nStarts, twoBit->nSizes);
00159 }
00160
00161
00162 if (doMask)
00163 {
00164 twoBit->maskBlockCount = countBlocksOfLower(dna, seq->size);
00165 if (twoBit->maskBlockCount > 0)
00166 {
00167 AllocArray(twoBit->maskStarts, twoBit->maskBlockCount);
00168 AllocArray(twoBit->maskSizes, twoBit->maskBlockCount);
00169 storeBlocksOfLower(dna, seq->size,
00170 twoBit->maskStarts, twoBit->maskSizes);
00171 }
00172 }
00173 return twoBit;
00174 }
00175
00176
00177 static int twoBitSizeInFile(struct twoBit *twoBit)
00178
00179 {
00180 return packedSize(twoBit->size)
00181 + sizeof(twoBit->size)
00182 + sizeof(twoBit->nBlockCount)
00183 + sizeof(twoBit->nStarts[0]) * twoBit->nBlockCount
00184 + sizeof(twoBit->nSizes[0]) * twoBit->nBlockCount
00185 + sizeof(twoBit->maskBlockCount)
00186 + sizeof(twoBit->maskStarts[0]) * twoBit->maskBlockCount
00187 + sizeof(twoBit->maskSizes[0]) * twoBit->maskBlockCount
00188 + sizeof(twoBit->reserved);
00189 }
00190
00191 void twoBitWriteOne(struct twoBit *twoBit, FILE *f)
00192
00193
00194
00195 {
00196 writeOne(f, twoBit->size);
00197 writeOne(f, twoBit->nBlockCount);
00198 if (twoBit->nBlockCount > 0)
00199 {
00200 fwrite(twoBit->nStarts, sizeof(twoBit->nStarts[0]),
00201 twoBit->nBlockCount, f);
00202 fwrite(twoBit->nSizes, sizeof(twoBit->nSizes[0]),
00203 twoBit->nBlockCount, f);
00204 }
00205 writeOne(f, twoBit->maskBlockCount);
00206 if (twoBit->maskBlockCount > 0)
00207 {
00208 fwrite(twoBit->maskStarts, sizeof(twoBit->maskStarts[0]),
00209 twoBit->maskBlockCount, f);
00210 fwrite(twoBit->maskSizes, sizeof(twoBit->maskSizes[0]),
00211 twoBit->maskBlockCount, f);
00212 }
00213 writeOne(f, twoBit->reserved);
00214 mustWrite(f, twoBit->data, packedSize(twoBit->size));
00215 }
00216
00217 void twoBitWriteHeader(struct twoBit *twoBitList, FILE *f)
00218
00219
00220 {
00221 bits32 sig = twoBitSig;
00222 bits32 version = 0;
00223 bits32 seqCount = slCount(twoBitList);
00224 bits32 reserved = 0;
00225 bits32 offset = 0;
00226 struct twoBit *twoBit;
00227 long long counter = 0;
00228
00229
00230 writeOne(f, sig);
00231 writeOne(f, version);
00232 writeOne(f, seqCount);
00233 writeOne(f, reserved);
00234
00235
00236
00237
00238 offset = sizeof(sig) + sizeof(version) + sizeof(seqCount) + sizeof(reserved);
00239 for (twoBit = twoBitList; twoBit != NULL; twoBit = twoBit->next)
00240 {
00241 int nameLen = strlen(twoBit->name);
00242 if (nameLen > 255)
00243 errAbort("name %s too long", twoBit->name);
00244 offset += nameLen + 1 + sizeof(bits32);
00245 }
00246
00247
00248 for (twoBit = twoBitList; twoBit != NULL; twoBit = twoBit->next)
00249 {
00250 int size = twoBitSizeInFile(twoBit);
00251 writeString(f, twoBit->name);
00252 writeOne(f, offset);
00253 offset += size;
00254 counter += (long long)size;
00255 if (counter > UINT_MAX )
00256 errAbort("Error in faToTwoBit, index overflow at %s. The 2bit format "
00257 "does not support indexes larger than %dGb, \n"
00258 "please split up into smaller files.\n",
00259 twoBit->name, UINT_MAX/1000000000);
00260 }
00261 }
00262
00263 void twoBitClose(struct twoBitFile **pTbf)
00264
00265 {
00266 struct twoBitFile *tbf = *pTbf;
00267 if (tbf != NULL)
00268 {
00269 freez(&tbf->fileName);
00270 carefulClose(&tbf->f);
00271 hashFree(&tbf->hash);
00272
00273 freez(pTbf);
00274 }
00275 }
00276
00277 static bits32 readBits32(FILE *f, boolean isSwapped)
00278
00279 {
00280 bits32 val;
00281 mustReadOne(f, val);
00282 if (isSwapped)
00283 val = byteSwap32(val);
00284 return val;
00285 }
00286
00287 struct twoBitFile *twoBitOpen(char *fileName)
00288
00289
00290 {
00291 bits32 sig;
00292 struct twoBitFile *tbf;
00293 struct twoBitIndex *index;
00294 boolean isSwapped = FALSE;
00295 int i;
00296 struct hash *hash;
00297 FILE *f = mustOpen(fileName, "rb");
00298
00299
00300
00301 AllocVar(tbf);
00302 mustReadOne(f, sig);
00303 if (sig == twoBitSwapSig)
00304 isSwapped = tbf->isSwapped = TRUE;
00305 else if (sig != twoBitSig)
00306 errAbort("%s doesn't have a valid twoBitSig", fileName);
00307 tbf->fileName = cloneString(fileName);
00308 tbf->f = f;
00309 tbf->version = readBits32(f, isSwapped);
00310 if (tbf->version != 0)
00311 {
00312 errAbort("Can only handle version 0 of this file. This is version %d",
00313 (int)tbf->version);
00314 }
00315 tbf->seqCount = readBits32(f, isSwapped);
00316 tbf->reserved = readBits32(f, isSwapped);
00317
00318
00319 hash = tbf->hash = hashNew(digitsBaseTwo(tbf->seqCount));
00320 for (i=0; i<tbf->seqCount; ++i)
00321 {
00322 char name[256];
00323 if (!fastReadString(f, name))
00324 errAbort("%s is truncated", fileName);
00325 lmAllocVar(hash->lm, index);
00326 index->offset = readBits32(f, isSwapped);
00327 hashAddSaveName(hash, name, index, &index->name);
00328 slAddHead(&tbf->indexList, index);
00329 }
00330 slReverse(&tbf->indexList);
00331 return tbf;
00332 }
00333
00334
00335 static int findGreatestLowerBound(int blockCount, bits32 *pos,
00336 int val)
00337
00338
00339 {
00340 int startIx=0, endIx=blockCount-1, midIx;
00341 int posVal;
00342
00343 for (;;)
00344 {
00345 if (startIx == endIx)
00346 {
00347 posVal = pos[startIx];
00348 if (posVal <= val || startIx == 0)
00349 return startIx;
00350 else
00351 return startIx-1;
00352 }
00353 midIx = ((startIx + endIx)>>1);
00354 posVal = pos[midIx];
00355 if (posVal < val)
00356 startIx = midIx+1;
00357 else
00358 endIx = midIx;
00359 }
00360 }
00361
00362 static void twoBitSeekTo(struct twoBitFile *tbf, char *name)
00363
00364 {
00365 struct twoBitIndex *index = hashFindVal(tbf->hash, name);
00366 if (index == NULL)
00367 errAbort("%s is not in %s", name, tbf->fileName);
00368 fseek(tbf->f, index->offset, SEEK_SET);
00369 }
00370
00371 static void readBlockCoords(FILE *f, boolean isSwapped, bits32 *retBlockCount,
00372 bits32 **retBlockStarts, bits32 **retBlockSizes)
00373
00374
00375 {
00376 bits32 blkCount = readBits32(f, isSwapped);
00377 *retBlockCount = blkCount;
00378 if (blkCount == 0)
00379 {
00380 *retBlockStarts = NULL;
00381 *retBlockSizes = NULL;
00382 }
00383 else
00384 {
00385 bits32 *nStarts, *nSizes;
00386 AllocArray(nStarts, blkCount);
00387 AllocArray(nSizes, blkCount);
00388 fread(nStarts, sizeof(nStarts[0]), blkCount, f);
00389 fread(nSizes, sizeof(nSizes[0]), blkCount, f);
00390 if (isSwapped)
00391 {
00392 int i;
00393 for (i=0; i<blkCount; ++i)
00394 {
00395 nStarts[i] = byteSwap32(nStarts[i]);
00396 nSizes[i] = byteSwap32(nSizes[i]);
00397 }
00398 }
00399 *retBlockStarts = nStarts;
00400 *retBlockSizes = nSizes;
00401 }
00402 }
00403
00404 struct twoBit *twoBitFromFile(char *fileName)
00405
00406 {
00407 struct twoBitFile *tbf = twoBitOpen(fileName);
00408 struct twoBitIndex *index;
00409 struct twoBit *twoBitList = NULL, *twoBit = NULL;
00410 boolean isSwapped = tbf->isSwapped;
00411 FILE *f = tbf->f;
00412
00413 for (index = tbf->indexList; index != NULL; index = index->next)
00414 {
00415 char *name = index->name;
00416 bits32 packByteCount;
00417
00418 AllocVar(twoBit);
00419 twoBit->name = cloneString(name);
00420
00421
00422 twoBitSeekTo(tbf, name);
00423
00424
00425 twoBit->size = readBits32(f, isSwapped);
00426
00427
00428 readBlockCoords(f, isSwapped, &(twoBit->nBlockCount),
00429 &(twoBit->nStarts), &(twoBit->nSizes));
00430
00431
00432 readBlockCoords(f, isSwapped, &(twoBit->maskBlockCount),
00433 &(twoBit->maskStarts), &(twoBit->maskSizes));
00434
00435
00436 twoBit->reserved = readBits32(f, isSwapped);
00437
00438
00439 packByteCount = packedSize(twoBit->size);
00440 twoBit->data = needLargeMem(packByteCount);
00441 mustRead(f, twoBit->data, packByteCount);
00442
00443 slAddHead(&twoBitList, twoBit);
00444 }
00445
00446 twoBitClose(&tbf);
00447 slReverse(&twoBitList);
00448 return twoBitList;
00449 }
00450
00451 struct dnaSeq *twoBitReadSeqFragExt(struct twoBitFile *tbf, char *name,
00452 int fragStart, int fragEnd, boolean doMask, int *retFullSize)
00453
00454
00455
00456
00457 {
00458 struct dnaSeq *seq;
00459 bits32 seqSize;
00460 bits32 nBlockCount, maskBlockCount;
00461 bits32 *nStarts = NULL, *nSizes = NULL;
00462 bits32 *maskStarts = NULL, *maskSizes = NULL;
00463 boolean isSwapped = tbf->isSwapped;
00464 FILE *f = tbf->f;
00465 int i;
00466 int packByteCount, packedStart, packedEnd, remainder, midStart, midEnd;
00467 int outSize;
00468 UBYTE *packed, *packedAlloc;
00469 DNA *dna;
00470
00471
00472 dnaUtilOpen();
00473 twoBitSeekTo(tbf, name);
00474
00475
00476 seqSize = readBits32(f, isSwapped);
00477 if (fragEnd == 0)
00478 fragEnd = seqSize;
00479 if (fragEnd > seqSize)
00480 errAbort("twoBitReadSeqFrag in %s end (%d) >= seqSize (%d)", name, fragEnd, seqSize);
00481 outSize = fragEnd - fragStart;
00482 if (outSize < 1)
00483 errAbort("twoBitReadSeqFrag in %s start (%d) >= end (%d)", name, fragStart, fragEnd);
00484
00485
00486 readBlockCoords(f, isSwapped, &nBlockCount, &nStarts, &nSizes);
00487
00488
00489 readBlockCoords(f, isSwapped, &maskBlockCount, &maskStarts, &maskSizes);
00490
00491
00492 readBits32(f, isSwapped);
00493
00494
00495 AllocVar(seq);
00496 if (outSize == seqSize)
00497 seq->name = cloneString(name);
00498 else
00499 {
00500 char buf[256*2];
00501 safef(buf, sizeof(buf), "%s:%d-%d", name, fragStart, fragEnd);
00502 seq->name = cloneString(buf);
00503 }
00504 seq->size = outSize;
00505 dna = seq->dna = needLargeMem(outSize+1);
00506 seq->dna[outSize] = 0;
00507
00508
00509
00510 packedStart = (fragStart>>2);
00511 packedEnd = ((fragEnd+3)>>2);
00512 packByteCount = packedEnd - packedStart;
00513 packed = packedAlloc = needLargeMem(packByteCount);
00514 fseek(f, packedStart, SEEK_CUR);
00515 mustRead(f, packed, packByteCount);
00516
00517
00518 if (packByteCount == 1)
00519 {
00520 int pOff = (packedStart<<2);
00521 int pStart = fragStart - pOff;
00522 int pEnd = fragEnd - pOff;
00523 UBYTE partial = *packed;
00524 assert(pEnd <= 4);
00525 assert(pStart >= 0);
00526 for (i=pStart; i<pEnd; ++i)
00527 *dna++ = valToNt[(partial >> (6-i-i)) & 3];
00528 }
00529 else
00530 {
00531
00532 midStart = fragStart;
00533 remainder = (fragStart&3);
00534 if (remainder > 0)
00535 {
00536 UBYTE partial = *packed++;
00537 int partCount = 4 - remainder;
00538 for (i=partCount-1; i>=0; --i)
00539 {
00540 dna[i] = valToNt[partial&3];
00541 partial >>= 2;
00542 }
00543 midStart += partCount;
00544 dna += partCount;
00545 }
00546
00547
00548 remainder = fragEnd&3;
00549 midEnd = fragEnd - remainder;
00550 for (i=midStart; i<midEnd; i += 4)
00551 {
00552 UBYTE b = *packed++;
00553 dna[3] = valToNt[b&3];
00554 b >>= 2;
00555 dna[2] = valToNt[b&3];
00556 b >>= 2;
00557 dna[1] = valToNt[b&3];
00558 b >>= 2;
00559 dna[0] = valToNt[b&3];
00560 dna += 4;
00561 }
00562
00563 if (remainder >0)
00564 {
00565 UBYTE part = *packed;
00566 part >>= (8-remainder-remainder);
00567 for (i=remainder-1; i>=0; --i)
00568 {
00569 dna[i] = valToNt[part&3];
00570 part >>= 2;
00571 }
00572 }
00573 }
00574 freez(&packedAlloc);
00575
00576 if (nBlockCount > 0)
00577 {
00578 int startIx = findGreatestLowerBound(nBlockCount, nStarts, fragStart);
00579 for (i=startIx; i<nBlockCount; ++i)
00580 {
00581 int s = nStarts[i];
00582 int e = s + nSizes[i];
00583 if (s >= fragEnd)
00584 break;
00585 if (s < fragStart)
00586 s = fragStart;
00587 if (e > fragEnd)
00588 e = fragEnd;
00589 if (s < e)
00590 memset(seq->dna + s - fragStart, 'n', e - s);
00591 }
00592 }
00593
00594 if (doMask)
00595 {
00596 toUpperN(seq->dna, seq->size);
00597 if (maskBlockCount > 0)
00598 {
00599 int startIx = findGreatestLowerBound(maskBlockCount, maskStarts,
00600 fragStart);
00601 for (i=startIx; i<maskBlockCount; ++i)
00602 {
00603 int s = maskStarts[i];
00604 int e = s + maskSizes[i];
00605 if (s >= fragEnd)
00606 break;
00607 if (s < fragStart)
00608 s = fragStart;
00609 if (e > fragEnd)
00610 e = fragEnd;
00611 if (s < e)
00612 toLowerN(seq->dna + s - fragStart, e - s);
00613 }
00614 }
00615 }
00616 freez(&nStarts);
00617 freez(&nSizes);
00618 freez(&maskStarts);
00619 freez(&maskSizes);
00620 if (retFullSize != NULL)
00621 *retFullSize = seqSize;
00622 return seq;
00623 }
00624
00625 struct dnaSeq *twoBitReadSeqFrag(struct twoBitFile *tbf, char *name,
00626 int fragStart, int fragEnd)
00627
00628
00629
00630
00631 {
00632 return twoBitReadSeqFragExt(tbf, name, fragStart, fragEnd, TRUE, NULL);
00633 }
00634
00635 struct dnaSeq *twoBitReadSeqFragLower(struct twoBitFile *tbf, char *name,
00636 int fragStart, int fragEnd)
00637
00638 {
00639 return twoBitReadSeqFragExt(tbf, name, fragStart, fragEnd, FALSE, NULL);
00640 }
00641
00642 int twoBitSeqSize(struct twoBitFile *tbf, char *name)
00643
00644 {
00645 twoBitSeekTo(tbf, name);
00646 return readBits32(tbf->f, tbf->isSwapped);
00647 }
00648
00649 long long twoBitTotalSize(struct twoBitFile *tbf)
00650
00651 {
00652 struct twoBitIndex *index;
00653 long long totalSize = 0;
00654 for (index = tbf->indexList; index != NULL; index = index->next)
00655 {
00656 fseek(tbf->f, index->offset, SEEK_SET);
00657 totalSize += readBits32(tbf->f, tbf->isSwapped);
00658 }
00659 return totalSize;
00660 }
00661
00662 struct dnaSeq *twoBitLoadAll(char *spec)
00663
00664
00665
00666
00667
00668
00669
00670
00671
00672 {
00673 struct twoBitSpec *tbs = twoBitSpecNew(spec);
00674 struct twoBitFile *tbf = twoBitOpen(tbs->fileName);
00675 struct dnaSeq *list = NULL;
00676 if (tbs->seqs != NULL)
00677 {
00678 struct twoBitSeqSpec *tbss;
00679 for (tbss = tbs->seqs; tbss != NULL; tbss = tbss->next)
00680 slSafeAddHead(&list, twoBitReadSeqFrag(tbf, tbss->name,
00681 tbss->start, tbss->end));
00682 }
00683 else
00684 {
00685 struct twoBitIndex *index;
00686 for (index = tbf->indexList; index != NULL; index = index->next)
00687 slSafeAddHead(&list, twoBitReadSeqFrag(tbf, index->name, 0, 0));
00688 }
00689 slReverse(&list);
00690 twoBitClose(&tbf);
00691 twoBitSpecFree(&tbs);
00692 return list;
00693 }
00694
00695 struct slName *twoBitSeqNames(char *fileName)
00696
00697 {
00698 struct twoBitFile *tbf = twoBitOpen(fileName);
00699 struct twoBitIndex *index;
00700 struct slName *name, *list = NULL;
00701 for (index = tbf->indexList; index != NULL; index = index->next)
00702 {
00703 name = slNameNew(index->name);
00704 slAddHead(&list, name);
00705 }
00706 twoBitClose(&tbf);
00707 slReverse(&list);
00708 return list;
00709 }
00710
00711 boolean twoBitIsFile(char *fileName)
00712
00713 {
00714 return endsWith(fileName, ".2bit");
00715 }
00716
00717 boolean twoBitParseRange(char *rangeSpec, char **retFile,
00718 char **retSeq, int *retStart, int *retEnd)
00719
00720
00721
00722
00723
00724
00725
00726
00727
00728
00729
00730
00731 {
00732 char *s, *e;
00733 int n;
00734
00735
00736 if (retFile != NULL)
00737 *retFile = rangeSpec;
00738 if (retSeq != NULL)
00739 *retSeq = NULL;
00740 if (retStart != NULL)
00741 *retStart = 0;
00742 if (retEnd != NULL)
00743 *retEnd = 0;
00744
00745
00746 s = strrchr(rangeSpec, '/');
00747 if (s == NULL)
00748 s = rangeSpec;
00749 else
00750 s++;
00751
00752
00753 s = strchr(s, ':');
00754 if (s == NULL)
00755 return FALSE;
00756 *s++ = 0;
00757 if (retSeq != NULL)
00758 *retSeq = s;
00759
00760
00761 s = strchr(s, ':');
00762 if (s == NULL)
00763 return TRUE;
00764 *s++ = 0;
00765 n = strtol(s, &e, 0);
00766 if (*e != '-')
00767 return FALSE;
00768 if (retStart != NULL)
00769 *retStart = n;
00770 s = e+1;
00771
00772
00773 n = strtol(s, &e, 0);
00774 if (*e != '\0')
00775 return FALSE;
00776 if (retEnd != NULL)
00777 *retEnd = n;
00778 return TRUE;
00779 }
00780
00781 boolean twoBitIsRange(char *rangeSpec)
00782
00783 {
00784 char *dupe = cloneString(rangeSpec);
00785 char *file, *seq;
00786 int start, end;
00787 boolean isRange = twoBitParseRange(dupe, &file, &seq, &start, &end);
00788 if (isRange)
00789 isRange = twoBitIsFile(file);
00790 freeMem(dupe);
00791 return isRange;
00792 }
00793
00794 boolean twoBitIsFileOrRange(char *spec)
00795
00796 {
00797 return twoBitIsFile(spec) || twoBitIsRange(spec);
00798 }
00799
00800 static struct twoBitSeqSpec *parseSeqSpec(char *seqSpecStr)
00801
00802 {
00803 boolean isOk = TRUE;
00804 char *s, *e;
00805 struct twoBitSeqSpec *seq;
00806 AllocVar(seq);
00807 seq->name = cloneString(seqSpecStr);
00808
00809
00810 s = strchr(seq->name, ':');
00811 if (s == NULL)
00812 return seq;
00813 *s++ = 0;
00814 seq->start = strtol(s, &e, 0);
00815 if (*e != '-')
00816 isOk = FALSE;
00817 else
00818 {
00819
00820 s = e+1;
00821 seq->end = strtol(s, &e, 0);
00822 if (*e != '\0')
00823 isOk = FALSE;
00824 }
00825 if (!isOk || (seq->end < seq->start))
00826 errAbort("invalid twoBit sequence specification: \"%s\"", seqSpecStr);
00827 return seq;
00828 }
00829
00830 boolean twoBitIsSpec(char *spec)
00831
00832 {
00833 struct twoBitSpec *tbs = twoBitSpecNew(spec);
00834 boolean isSpec = (tbs != NULL);
00835 twoBitSpecFree(&tbs);
00836 return isSpec;
00837 }
00838
00839 struct twoBitSpec *twoBitSpecNew(char *specStr)
00840
00841
00842
00843
00844
00845
00846
00847
00848
00849
00850
00851
00852 {
00853 char *s, *e;
00854 int i, numSeqs;
00855 char **seqSpecs;
00856 struct twoBitSpec *spec;
00857 AllocVar(spec);
00858 spec->fileName = cloneString(specStr);
00859
00860
00861 s = strrchr(spec->fileName, '/');
00862 if (s == NULL)
00863 s = spec->fileName;
00864 else
00865 s++;
00866
00867
00868 e = strchr(s, ':');
00869 if (e == NULL)
00870 s = NULL;
00871 else
00872 {
00873 *e++ = '\0';
00874 s = e;
00875 }
00876 if (!endsWith(spec->fileName, ".2bit"))
00877 {
00878 twoBitSpecFree(&spec);
00879 return NULL;
00880 }
00881
00882 if (s != NULL)
00883 {
00884
00885 numSeqs = chopString(s, ",", NULL, 0);
00886 AllocArray(seqSpecs, numSeqs);
00887 chopString(s, ",", seqSpecs, numSeqs);
00888 for (i = 0; i< numSeqs; i++)
00889 slSafeAddHead(&spec->seqs, parseSeqSpec(seqSpecs[i]));
00890 slReverse(&spec->seqs);
00891 }
00892 return spec;
00893 }
00894
00895 struct twoBitSpec *twoBitSpecNewFile(char *twoBitFile, char *specFile)
00896
00897
00898
00899
00900
00901
00902 {
00903 struct lineFile *lf = lineFileOpen(specFile, TRUE);
00904 char *line;
00905 struct twoBitSpec *spec;
00906 AllocVar(spec);
00907 spec->fileName = cloneString(twoBitFile);
00908 while (lineFileNextReal(lf, &line))
00909 slSafeAddHead(&spec->seqs, parseSeqSpec(trimSpaces(line)));
00910 slReverse(&spec->seqs);
00911 lineFileClose(&lf);
00912 return spec;
00913 }
00914
00915 void twoBitSpecFree(struct twoBitSpec **specPtr)
00916
00917 {
00918 struct twoBitSpec *spec = *specPtr;
00919 if (spec != NULL)
00920 {
00921 struct twoBitSeqSpec *seq;
00922 while ((seq = slPopHead(&spec->seqs)) != NULL)
00923 {
00924 freeMem(seq->name);
00925 freeMem(seq);
00926 }
00927 freeMem(spec->fileName);
00928 freeMem(spec);
00929 *specPtr = NULL;
00930 }
00931 }
00932
00933 void twoBitOutNBeds(struct twoBitFile *tbf, char *seqName, FILE *outF)
00934
00935 {
00936 int nBlockCount;
00937
00938 twoBitSeekTo(tbf, seqName);
00939
00940 readBits32(tbf->f, tbf->isSwapped);
00941
00942
00943 nBlockCount = readBits32(tbf->f, tbf->isSwapped);
00944
00945 if (nBlockCount > 0)
00946 {
00947 bits32 *nStarts = NULL, *nSizes = NULL;
00948 int i;
00949
00950 AllocArray(nStarts, nBlockCount);
00951 AllocArray(nSizes, nBlockCount);
00952 fread(nStarts, sizeof(nStarts[0]), nBlockCount, tbf->f);
00953 fread(nSizes, sizeof(nSizes[0]), nBlockCount, tbf->f);
00954 if (tbf->isSwapped)
00955 {
00956 for (i=0; i<nBlockCount; ++i)
00957 {
00958 nStarts[i] = byteSwap32(nStarts[i]);
00959 nSizes[i] = byteSwap32(nSizes[i]);
00960 }
00961 }
00962
00963 for (i=0; i<nBlockCount; ++i)
00964 {
00965 fprintf(outF, "%s\t%d\t%d\n", seqName, nStarts[i], nStarts[i] + nSizes[i]);
00966 }
00967
00968 freez(&nStarts);
00969 freez(&nSizes);
00970 }
00971 }
00972
00973 int twoBitSeqSizeNoNs(struct twoBitFile *tbf, char *seqName)
00974
00975 {
00976 int nBlockCount;
00977 int size;
00978
00979 twoBitSeekTo(tbf, seqName);
00980
00981 size = readBits32(tbf->f, tbf->isSwapped);
00982
00983
00984 nBlockCount = readBits32(tbf->f, tbf->isSwapped);
00985
00986 if (nBlockCount > 0)
00987 {
00988 bits32 *nStarts = NULL, *nSizes = NULL;
00989
00990 int i;
00991
00992 AllocArray(nStarts, nBlockCount);
00993 AllocArray(nSizes, nBlockCount);
00994 fread(nStarts, sizeof(nStarts[0]), nBlockCount, tbf->f);
00995 fread(nSizes, sizeof(nSizes[0]), nBlockCount, tbf->f);
00996 if (tbf->isSwapped)
00997 {
00998 for (i=0; i<nBlockCount; ++i)
00999 {
01000 nStarts[i] = byteSwap32(nStarts[i]);
01001 nSizes[i] = byteSwap32(nSizes[i]);
01002 }
01003 }
01004
01005 for (i=0; i<nBlockCount; ++i)
01006 {
01007 size -= nSizes[i];
01008 }
01009
01010 freez(&nStarts);
01011 freez(&nSizes);
01012 }
01013
01014 return(size);
01015 }