00001
00002
00003
00004
00005
00006 #include "common.h"
00007 #include "hash.h"
00008 #include "dnautil.h"
00009 #include "dnaseq.h"
00010 #include "nib.h"
00011 #include "sig.h"
00012
00013 static char const rcsid[] = "$Id: nib.c,v 1.24 2007/03/13 20:43:05 kent Exp $";
00014
00015 static char *findNibSubrange(char *fileName)
00016
00017
00018 {
00019 char *baseName = strrchr(fileName, '/');
00020 baseName = (baseName == NULL) ? fileName : baseName+1;
00021 return strchr(baseName, ':');
00022 }
00023
00024 static void parseSubrange(char *subrange, char *name,
00025 unsigned *start, unsigned *end)
00026
00027 {
00028 char *rangePart = strchr(subrange+1, ':');
00029 if (rangePart != NULL)
00030 {
00031
00032 *rangePart = '\0';
00033 strcpy(name, subrange+1);
00034 *rangePart = ':';
00035 rangePart++;
00036 }
00037 else
00038 {
00039
00040 rangePart = subrange+1;
00041 strcpy(name, "");
00042 }
00043 if ((sscanf(rangePart, "%u-%u", start, end) != 2) || (*start > *end))
00044 errAbort("can't parse nib file subsequence specification: %s",
00045 subrange);
00046 }
00047
00048 void nibParseName(unsigned options, char *fileSpec, char *filePath,
00049 char *name, unsigned *start, unsigned *end)
00050
00051
00052
00053
00054 {
00055 char *subrange = findNibSubrange(fileSpec);
00056 if (subrange != NULL)
00057 {
00058 *subrange = '\0';
00059 parseSubrange(subrange, name, start, end);
00060 strcpy(filePath, fileSpec);
00061 *subrange = ':';
00062 if (strlen(name) == 0)
00063 {
00064
00065 if (options & NIB_BASE_NAME)
00066 splitPath(filePath, NULL, name, NULL);
00067 else
00068 strcpy(name, filePath);
00069 sprintf(name+strlen(name), ":%u-%u", *start, *end);
00070 }
00071 }
00072 else
00073 {
00074 *start = 0;
00075 *end = 0;
00076 strcpy(filePath, fileSpec);
00077 if (options & NIB_BASE_NAME)
00078 splitPath(fileSpec, NULL, name, NULL);
00079 else
00080 strcpy(name, fileSpec);
00081 }
00082 }
00083
00084 void nibOpenVerify(char *fileName, FILE **retFile, int *retSize)
00085
00086 {
00087 bits32 size;
00088 bits32 sig;
00089 FILE *f = fopen(fileName, "rb");
00090 char buffer[512];
00091 char buffer2[512];
00092 char buffer3[512];
00093
00094 if (f == NULL)
00095 {
00096
00097 char *ptr = NULL;
00098 char *dir, *file;
00099 struct stat statBuf;
00100
00101
00102 safef(buffer, sizeof(buffer), "%s", fileName);
00103 if ((ptr = strrchr(buffer, '/')) != NULL)
00104 {
00105 *ptr++ = 0;
00106 dir = buffer;
00107 file = ptr;
00108 }
00109 else
00110 {
00111 dir = "";
00112 file = buffer;
00113 }
00114
00115 buffer3[0] = 0;
00116
00117 for(ptr = &file[strlen(file) - 5]; ; )
00118 {
00119 strcpy(buffer2, buffer3);
00120 if (isdigit(*ptr))
00121 {
00122
00123 safef(buffer3, sizeof(buffer3), "%c/%s",*ptr,buffer2);
00124 ptr--;
00125 }
00126 else
00127
00128 safef(buffer3, sizeof(buffer3), "0/%s",buffer2);
00129
00130
00131 safef(buffer2, sizeof(buffer2), "%s/%s", dir, buffer3);
00132 if (stat(buffer2, &statBuf) < 0)
00133 break;
00134
00135
00136 safef(buffer2, sizeof(buffer2), "%s/%s/%s", dir, buffer3, file);
00137 if ((f = fopen(buffer2, "rb")) != NULL)
00138 break;
00139 }
00140 if (f == NULL)
00141 errAbort("Can't open %s to read: %s", fileName, strerror(errno));
00142 }
00143 dnaUtilOpen();
00144 mustReadOne(f, sig);
00145 mustReadOne(f, size);
00146 if (sig != nibSig)
00147 {
00148 sig = byteSwap32(sig);
00149 size = byteSwap32(size);
00150 if (sig != nibSig)
00151 errAbort("%s is not a good .nib file.", fileName);
00152 }
00153 *retSize = size;
00154 *retFile = f;
00155 }
00156
00157 static struct dnaSeq *nibInput(int options, char *fileName, char *seqName,
00158 FILE *f, int seqSize, int start, int size)
00159
00160 {
00161 int end;
00162 DNA *d;
00163 int bVal;
00164 DNA *valToNtTbl = ((options & NIB_MASK_MIXED) ? valToNtMasked : valToNt);
00165 struct dnaSeq *seq;
00166 Bits* mask = NULL;
00167 int bytePos, byteSize;
00168 int maskIdx = 0;
00169
00170 assert(start >= 0);
00171 assert(size >= 0);
00172
00173 end = start+size;
00174 if (end > seqSize)
00175 errAbort("nib read past end of file (%d %d) in file: %s",
00176 end, seqSize, (fileName != NULL ? fileName : "(NULL)"));
00177
00178 AllocVar(seq);
00179 seq->size = size;
00180 seq->name = cloneString(seqName);
00181 seq->dna = d = needLargeMem(size+1);
00182 if (options & NIB_MASK_MIXED)
00183 seq->mask = mask = bitAlloc(size);
00184
00185 bytePos = (start>>1);
00186 fseek(f, bytePos + 2*sizeof(bits32), SEEK_SET);
00187 if (start & 1)
00188 {
00189 bVal = getc_unlocked(f);
00190 if (bVal < 0)
00191 {
00192 errAbort("Read error 1 in %s", fileName);
00193 }
00194 *d++ = valToNtTbl[(bVal&0xf)];
00195 size -= 1;
00196 if (mask != NULL)
00197 {
00198 if ((bVal&0xf&MASKED_BASE_BIT) == 0)
00199 bitSetOne(mask, maskIdx);
00200 maskIdx++;
00201 }
00202 }
00203 byteSize = (size>>1);
00204 while (--byteSize >= 0)
00205 {
00206 bVal = getc_unlocked(f);
00207 if (bVal < 0)
00208 errAbort("Read error 2 in %s", fileName);
00209 d[0] = valToNtTbl[(bVal>>4)];
00210 d[1] = valToNtTbl[(bVal&0xf)];
00211 d += 2;
00212 if (mask != NULL)
00213 {
00214 if (((bVal>>4)&0xf) == 0)
00215 bitSetOne(mask, maskIdx);
00216 if ((bVal&0xf) == 0)
00217 bitSetOne(mask, maskIdx+1);
00218 maskIdx += 2;
00219 }
00220 }
00221 if (size&1)
00222 {
00223 bVal = getc_unlocked(f);
00224 if (bVal < 0)
00225 errAbort("Read error 3 in %s", fileName);
00226 *d++ = valToNtTbl[(bVal>>4)];
00227 if (mask != NULL)
00228 {
00229 if ((bVal>>4) == 0)
00230 bitSetOne(mask, maskIdx);
00231 maskIdx++;
00232 }
00233 }
00234 *d = 0;
00235 return seq;
00236 }
00237
00238 static void nibOutput(int options, struct dnaSeq *seq, char *fileName)
00239
00240
00241 {
00242 UBYTE byte;
00243 DNA *dna = seq->dna;
00244 int dVal1, dVal2;
00245 bits32 size = seq->size;
00246 int byteCount = (size>>1);
00247 bits32 sig = nibSig;
00248 int *ntValTbl = ((options & NIB_MASK_MIXED) ? ntValMasked : ntVal5);
00249 Bits* mask = ((options & NIB_MASK_MAP) ? seq->mask : NULL);
00250 int maskIdx = 0;
00251 FILE *f = mustOpen(fileName, "w");
00252
00253 assert(sizeof(bits32) == 4);
00254
00255 writeOne(f, sig);
00256 writeOne(f, seq->size);
00257
00258 printf("Writing %d bases in %d bytes\n", seq->size, ((seq->size+1)/2) + 8);
00259 while (--byteCount >= 0)
00260 {
00261 dVal1 = ntValTbl[(int)dna[0]];
00262 dVal2 = ntValTbl[(int)dna[1]];
00263
00264
00265 if (mask != NULL)
00266 {
00267 if (!bitReadOne(mask, maskIdx))
00268 dVal1 |= MASKED_BASE_BIT;
00269 if (!bitReadOne(mask, maskIdx+1))
00270 dVal2 |= MASKED_BASE_BIT;
00271 maskIdx += 2;
00272 }
00273 byte = (dVal1<<4) | dVal2;
00274 if (putc(byte, f) < 0)
00275 {
00276 perror("");
00277 errAbort("Couldn't write all of %s", fileName);
00278 }
00279 dna += 2;
00280 }
00281 if (size & 1)
00282 {
00283 dVal1 = ntValTbl[(int)dna[0]];
00284 if ((mask != NULL) && !bitReadOne(mask, maskIdx))
00285 dVal1 |= MASKED_BASE_BIT;
00286 byte = (dVal1<<4);
00287 putc(byte, f);
00288 }
00289 carefulClose(&f);
00290 }
00291
00292 struct dnaSeq *nibLdPartMasked(int options, char *fileName, FILE *f, int seqSize, int start, int size)
00293
00294
00295 {
00296 char nameBuf[512];
00297 safef(nameBuf, sizeof(nameBuf), "%s:%d-%d", fileName, start, start+size);
00298 return nibInput(options, fileName, nameBuf, f, seqSize, start, size);
00299 }
00300
00301 struct dnaSeq *nibLdPart(char *fileName, FILE *f, int seqSize, int start, int size)
00302
00303 {
00304 return nibLdPartMasked(0, fileName, f, seqSize, start, size);
00305 }
00306
00307 struct dnaSeq *nibLoadPartMasked(int options, char *fileName, int start, int size)
00308
00309 {
00310 struct dnaSeq *seq;
00311 FILE *f;
00312 int seqSize;
00313 nibOpenVerify(fileName, &f, &seqSize);
00314 seq = nibLdPartMasked(options, fileName, f, seqSize, start, size);
00315 fclose(f);
00316 return seq;
00317 }
00318
00319 struct dnaSeq *nibLoadPart(char *fileName, int start, int size)
00320
00321 {
00322 return nibLoadPartMasked(0, fileName, start, size);
00323 }
00324
00325 struct dnaSeq *nibLoadAllMasked(int options, char *fileName)
00326
00327
00328
00329
00330
00331
00332
00333
00334
00335 {
00336 struct dnaSeq *seq;
00337 FILE *f;
00338 int seqSize;
00339 char filePath[PATH_LEN];
00340 char name[PATH_LEN];
00341 unsigned start, end;
00342
00343 nibParseName(options, fileName, filePath, name, &start, &end);
00344 nibOpenVerify(filePath, &f, &seqSize);
00345 if (end == 0)
00346 end = seqSize;
00347 seq = nibInput(options, fileName, name, f, seqSize, start, end-start);
00348 fclose(f);
00349 return seq;
00350 }
00351
00352 struct dnaSeq *nibLoadAll(char *fileName)
00353
00354 {
00355 return nibLoadAllMasked(0, fileName);
00356 }
00357
00358 void nibWriteMasked(int options, struct dnaSeq *seq, char *fileName)
00359
00360
00361 {
00362 nibOutput(options, seq, fileName);
00363 }
00364
00365 void nibWrite(struct dnaSeq *seq, char *fileName)
00366
00367 {
00368 nibWriteMasked(0, seq, fileName);
00369 }
00370
00371 struct nibStream
00372
00373
00374
00375 {
00376 struct nibStream *next;
00377 char *fileName;
00378 FILE *f;
00379 bits32 size;
00380 UBYTE byte;
00381 };
00382
00383 struct nibStream *nibStreamOpen(char *fileName)
00384
00385 {
00386 struct nibStream *ns;
00387 FILE *f;
00388
00389 dnaUtilOpen();
00390 AllocVar(ns);
00391 ns->f = f = mustOpen(fileName, "wb");
00392 ns->fileName = cloneString(fileName);
00393
00394
00395 writeOne(f, ns->size);
00396 writeOne(f, ns->size);
00397
00398 return ns;
00399 }
00400
00401 void nibStreamClose(struct nibStream **pNs)
00402
00403 {
00404 struct nibStream *ns = *pNs;
00405 FILE *f;
00406 bits32 sig = nibSig;
00407 if (ns == NULL)
00408 return;
00409 f = ns->f;
00410 if (ns->size&1)
00411 writeOne(f, ns->byte);
00412 fseek(f, 0L, SEEK_SET);
00413 writeOne(f, sig);
00414 writeOne(f, ns->size);
00415 fclose(f);
00416 freeMem(ns->fileName);
00417 freez(pNs);
00418 }
00419
00420 void nibStreamOne(struct nibStream *ns, DNA base)
00421
00422 {
00423 UBYTE ub = ntVal5[(int)base];
00424
00425 if ((++ns->size&1) == 0)
00426 {
00427 ub += ns->byte;
00428 writeOne(ns->f, ub);
00429 }
00430 else
00431 {
00432 ns->byte = (ub<<4);
00433 }
00434 }
00435
00436 void nibStreamMany(struct nibStream *ns, DNA *dna, int size)
00437
00438 {
00439 int i;
00440 for (i=0; i<size; ++i)
00441 nibStreamOne(ns, *dna++);
00442 }
00443
00444 boolean nibIsFile(char *fileName)
00445
00446 {
00447 boolean isANib;
00448 char *subrange = findNibSubrange(fileName);
00449 if (subrange != NULL)
00450 *subrange = '\0';
00451 isANib = endsWith(fileName, ".nib") || endsWith(fileName, ".NIB");
00452 if (subrange != NULL)
00453 *subrange = ':';
00454 return isANib;
00455 }
00456
00457 boolean nibIsRange(char *fileName)
00458
00459 {
00460 boolean isANib;
00461 char *subrange = findNibSubrange(fileName);;
00462 if (subrange == NULL)
00463 return FALSE;
00464 *subrange = '\0';
00465 isANib = endsWith(fileName, ".nib") || endsWith(fileName, ".NIB");
00466 *subrange = ':';
00467 return isANib;
00468 }
00469
00470 struct nibInfo *nibInfoNew(char *path)
00471
00472 {
00473 struct nibInfo *nib;
00474 AllocVar(nib);
00475 nib->fileName = cloneString(path);
00476 nibOpenVerify(path, &nib->f, &nib->size);
00477 return nib;
00478 }
00479
00480 void nibInfoFree(struct nibInfo **pNib)
00481
00482 {
00483 struct nibInfo *nib = *pNib;
00484 if (nib != NULL)
00485 {
00486 carefulClose(&nib->f);
00487 freeMem(nib->fileName);
00488 freez(pNib);
00489 }
00490 }
00491
00492 struct nibInfo *nibInfoFromCache(struct hash *hash, char *nibDir, char *nibName)
00493
00494 {
00495 struct nibInfo *nib;
00496 char path[PATH_LEN];
00497 safef(path, sizeof(path), "%s/%s.nib", nibDir, nibName);
00498 nib = hashFindVal(hash, path);
00499 if (nib == NULL)
00500 {
00501 nib = nibInfoNew(path);
00502 hashAdd(hash, path, nib);
00503 }
00504 return nib;
00505 }
00506
00507 int nibGetSize(char* nibFile)
00508
00509 {
00510 FILE* fh;
00511 int size;
00512
00513 nibOpenVerify(nibFile, &fh, &size);
00514 carefulClose(&fh);
00515 return size;
00516 }
00517