lib/nib.c

Go to the documentation of this file.
00001 /* Nib - nibble (4 bit) representation of nucleotide sequences. 
00002  *
00003  * This file is copyright 2002 Jim Kent, but license is hereby
00004  * granted for all use - public, private or commercial. */
00005 
00006 #include "common.h"
00007 #include "hash.h"
00008 #include "dnautil.h"
00009 #include "dnaseq.h"
00010 #include "nib.h"
00011 #include "sig.h"
00012 
00013 static char const rcsid[] = "$Id: nib.c,v 1.24 2007/03/13 20:43:05 kent Exp $";
00014 
00015 static char *findNibSubrange(char *fileName)
00016 /* find the colon starting a nib seq name/subrange in a nib file name, or NULL
00017  * if none */
00018 {
00019 char *baseName = strrchr(fileName, '/');
00020 baseName = (baseName == NULL) ? fileName : baseName+1;
00021 return strchr(baseName, ':');
00022 }
00023 
00024 static void parseSubrange(char *subrange, char *name, 
00025         unsigned *start, unsigned *end)
00026 /* parse the subrange specification */
00027 {
00028 char *rangePart = strchr(subrange+1, ':');
00029 if (rangePart != NULL)
00030     {
00031     /* :seqId:start-end form */
00032     *rangePart = '\0';
00033     strcpy(name, subrange+1);
00034     *rangePart = ':';
00035     rangePart++;
00036     }
00037 else
00038     {
00039     /* :start-end form */
00040     rangePart = subrange+1;
00041     strcpy(name, ""); 
00042     }
00043 if ((sscanf(rangePart, "%u-%u", start, end) != 2) || (*start > *end))
00044     errAbort("can't parse nib file subsequence specification: %s",
00045              subrange);
00046 }
00047 
00048 void nibParseName(unsigned options, char *fileSpec, char *filePath,
00049                          char *name, unsigned *start, unsigned *end)
00050 /* Parse the nib name, getting the file name, seq name to use, and
00051  * optionally the start and end positions. Zero is return for start
00052  * and end if they are not specified. Return the path to the file
00053  * and the name to use for the sequence. */
00054 {
00055 char *subrange = findNibSubrange(fileSpec);
00056 if (subrange != NULL)
00057     {
00058     *subrange = '\0';
00059     parseSubrange(subrange, name, start, end);
00060     strcpy(filePath, fileSpec);
00061     *subrange = ':';
00062     if (strlen(name) == 0)
00063         {
00064         /* no name in spec, generate one */
00065         if (options & NIB_BASE_NAME)
00066             splitPath(filePath, NULL, name, NULL);
00067         else
00068             strcpy(name, filePath);
00069         sprintf(name+strlen(name), ":%u-%u", *start, *end);
00070         }
00071     }
00072 else
00073     {
00074     *start = 0;
00075     *end = 0;
00076     strcpy(filePath, fileSpec);
00077     if (options & NIB_BASE_NAME)
00078         splitPath(fileSpec, NULL, name, NULL);
00079     else
00080         strcpy(name, fileSpec);
00081     }
00082 }
00083 
00084 void nibOpenVerify(char *fileName, FILE **retFile, int *retSize)
00085 /* Open file and verify it's in good nibble format. */
00086 {
00087 bits32 size;
00088 bits32 sig;
00089 FILE *f = fopen(fileName, "rb");
00090 char buffer[512];
00091 char buffer2[512];
00092 char buffer3[512];
00093 
00094 if (f == NULL)
00095     {
00096     /* see if nib is down a few directories ala faSplit -outDirDepth */
00097     char *ptr = NULL;
00098     char *dir, *file;
00099     struct stat statBuf;
00100 
00101     /* divide fileName into file and directory components */
00102     safef(buffer, sizeof(buffer), "%s", fileName);
00103     if ((ptr = strrchr(buffer, '/')) != NULL)
00104         {
00105         *ptr++ = 0;
00106         dir = buffer;
00107         file = ptr;
00108         }
00109     else
00110         {
00111         dir = "";
00112         file = buffer;
00113         }
00114     
00115     buffer3[0] = 0;
00116     /* start at the end of the fileName (minus .nib) */
00117     for(ptr = &file[strlen(file) - 5]; ; )
00118         {
00119         strcpy(buffer2, buffer3);
00120         if (isdigit(*ptr))
00121             {
00122             /* if we have a digit in the fileName, see if there is a directory with this name */
00123             safef(buffer3, sizeof(buffer3), "%c/%s",*ptr,buffer2);
00124             ptr--;
00125             }
00126         else
00127             /* we've run out of digits in the fileName, just add 0's */
00128             safef(buffer3, sizeof(buffer3), "0/%s",buffer2);
00129 
00130         /* check to see if this directory exists */
00131         safef(buffer2, sizeof(buffer2), "%s/%s", dir, buffer3);
00132         if (stat(buffer2, &statBuf) < 0)
00133             break;
00134 
00135         /* directory exists, see if our file is down there */
00136         safef(buffer2, sizeof(buffer2), "%s/%s/%s", dir, buffer3, file);
00137         if  ((f = fopen(buffer2, "rb")) != NULL)
00138             break;
00139         }
00140     if (f == NULL)
00141         errAbort("Can't open %s to read: %s", fileName,  strerror(errno));
00142     }
00143 dnaUtilOpen();
00144 mustReadOne(f, sig);
00145 mustReadOne(f, size);
00146 if (sig != nibSig)
00147     {
00148     sig = byteSwap32(sig);
00149     size = byteSwap32(size);
00150     if (sig != nibSig)
00151         errAbort("%s is not a good .nib file.",  fileName);
00152     }
00153 *retSize = size;
00154 *retFile = f;
00155 }
00156 
00157 static struct dnaSeq *nibInput(int options, char *fileName, char *seqName,
00158                                FILE *f, int seqSize, int start, int size)
00159 /* Load part of an open .nib file. */
00160 {
00161 int end;
00162 DNA *d;
00163 int bVal;
00164 DNA *valToNtTbl = ((options &  NIB_MASK_MIXED) ? valToNtMasked : valToNt);
00165 struct dnaSeq *seq;
00166 Bits* mask = NULL;
00167 int bytePos, byteSize;
00168 int maskIdx = 0;
00169 
00170 assert(start >= 0);
00171 assert(size >= 0);
00172 
00173 end = start+size;
00174 if (end > seqSize)
00175     errAbort("nib read past end of file (%d %d) in file: %s", 
00176              end, seqSize, (fileName != NULL ? fileName : "(NULL)"));
00177 
00178 AllocVar(seq);
00179 seq->size = size;
00180 seq->name = cloneString(seqName);
00181 seq->dna = d = needLargeMem(size+1);
00182 if (options & NIB_MASK_MIXED)
00183     seq->mask = mask = bitAlloc(size);
00184 
00185 bytePos = (start>>1);
00186 fseek(f, bytePos + 2*sizeof(bits32), SEEK_SET);
00187 if (start & 1)
00188     {
00189     bVal = getc_unlocked(f);
00190     if (bVal < 0)
00191         {
00192         errAbort("Read error 1 in %s", fileName);
00193         }
00194     *d++ = valToNtTbl[(bVal&0xf)];
00195     size -= 1;
00196     if (mask != NULL)
00197         {
00198         if ((bVal&0xf&MASKED_BASE_BIT) == 0)
00199             bitSetOne(mask, maskIdx);
00200         maskIdx++;
00201         }
00202     }
00203 byteSize = (size>>1);
00204 while (--byteSize >= 0)
00205     {
00206     bVal = getc_unlocked(f);
00207     if (bVal < 0)
00208         errAbort("Read error 2 in %s", fileName);
00209     d[0] = valToNtTbl[(bVal>>4)];
00210     d[1] = valToNtTbl[(bVal&0xf)];
00211     d += 2;
00212     if (mask != NULL)
00213         {
00214         if (((bVal>>4)&0xf) == 0)
00215             bitSetOne(mask, maskIdx);
00216         if ((bVal&0xf) == 0)
00217             bitSetOne(mask, maskIdx+1);
00218         maskIdx += 2;
00219         }
00220     }
00221 if (size&1)
00222     {
00223     bVal = getc_unlocked(f);
00224     if (bVal < 0)
00225         errAbort("Read error 3 in %s", fileName);
00226     *d++ = valToNtTbl[(bVal>>4)];
00227     if (mask != NULL)
00228         {
00229         if ((bVal>>4) == 0)
00230             bitSetOne(mask, maskIdx);
00231         maskIdx++;
00232         }
00233     }
00234 *d = 0;
00235 return seq;
00236 }
00237 
00238 static void nibOutput(int options, struct dnaSeq *seq, char *fileName)
00239 /* Write out file in format of four bits per nucleotide, with control over
00240  * handling of masked positions. */
00241 {
00242 UBYTE byte;
00243 DNA *dna = seq->dna;
00244 int dVal1, dVal2;
00245 bits32 size = seq->size;
00246 int byteCount = (size>>1);
00247 bits32 sig = nibSig;
00248 int *ntValTbl = ((options & NIB_MASK_MIXED) ? ntValMasked : ntVal5);
00249 Bits* mask = ((options & NIB_MASK_MAP) ? seq->mask : NULL);
00250 int maskIdx = 0;
00251 FILE *f = mustOpen(fileName, "w");
00252 
00253 assert(sizeof(bits32) == 4);
00254 
00255 writeOne(f, sig);
00256 writeOne(f, seq->size);
00257 
00258 printf("Writing %d bases in %d bytes\n", seq->size, ((seq->size+1)/2) + 8);
00259 while (--byteCount >= 0)
00260     {
00261     dVal1 = ntValTbl[(int)dna[0]];
00262     dVal2 = ntValTbl[(int)dna[1]];
00263     /* Set from mask, remember bit in character is opposite sense of bit
00264      * in mask. */
00265     if (mask != NULL)
00266         {
00267         if (!bitReadOne(mask, maskIdx))
00268             dVal1 |= MASKED_BASE_BIT;
00269         if (!bitReadOne(mask, maskIdx+1))
00270             dVal2 |= MASKED_BASE_BIT;
00271         maskIdx += 2;
00272         }
00273     byte = (dVal1<<4) | dVal2;
00274     if (putc(byte, f) < 0)
00275         {
00276         perror("");
00277         errAbort("Couldn't write all of %s", fileName);
00278         }
00279     dna += 2;
00280     }
00281 if (size & 1)
00282     {
00283     dVal1 = ntValTbl[(int)dna[0]];
00284     if ((mask != NULL) && !bitReadOne(mask, maskIdx))
00285         dVal1 |= MASKED_BASE_BIT;
00286     byte = (dVal1<<4);
00287     putc(byte, f);
00288     }
00289 carefulClose(&f);
00290 }
00291 
00292 struct dnaSeq *nibLdPartMasked(int options, char *fileName, FILE *f, int seqSize, int start, int size)
00293 /* Load part of an open .nib file, with control over handling of masked
00294  * positions. */
00295 {
00296 char nameBuf[512];
00297 safef(nameBuf, sizeof(nameBuf), "%s:%d-%d", fileName, start, start+size);
00298 return nibInput(options, fileName, nameBuf, f, seqSize, start, size);
00299 }
00300 
00301 struct dnaSeq *nibLdPart(char *fileName, FILE *f, int seqSize, int start, int size)
00302 /* Load part of an open .nib file. */
00303 {
00304 return nibLdPartMasked(0, fileName, f, seqSize, start, size);
00305 }
00306 
00307 struct dnaSeq *nibLoadPartMasked(int options, char *fileName, int start, int size)
00308 /* Load part of an .nib file, with control over handling of masked positions */
00309 {
00310 struct dnaSeq *seq;
00311 FILE *f;
00312 int seqSize;
00313 nibOpenVerify(fileName, &f, &seqSize);
00314 seq = nibLdPartMasked(options, fileName, f, seqSize, start, size);
00315 fclose(f);
00316 return seq;
00317 }
00318 
00319 struct dnaSeq *nibLoadPart(char *fileName, int start, int size)
00320 /* Load part of an .nib file. */
00321 {
00322 return nibLoadPartMasked(0, fileName, start, size);
00323 }
00324 
00325 struct dnaSeq *nibLoadAllMasked(int options, char *fileName)
00326 /* Load part of a .nib file, with control over handling of masked
00327  * positions. Subranges of nib files may specified in the file name
00328  * using the syntax:
00329  *    /path/file.nib:seqid:start-end
00330  * or\n"
00331  *    /path/file.nib:start-end
00332  * With the first form, seqid becomes the id of the subrange, with the second
00333  * form, a sequence id of file:start-end will be used.
00334  */
00335 {
00336 struct dnaSeq *seq;
00337 FILE *f;
00338 int seqSize;
00339 char filePath[PATH_LEN];
00340 char name[PATH_LEN];
00341 unsigned start, end;
00342 
00343 nibParseName(options, fileName, filePath, name, &start, &end);
00344 nibOpenVerify(filePath, &f, &seqSize);
00345 if (end == 0)
00346     end = seqSize;
00347 seq = nibInput(options, fileName, name, f, seqSize, start, end-start);
00348 fclose(f);
00349 return seq;
00350 }
00351 
00352 struct dnaSeq *nibLoadAll(char *fileName)
00353 /* Load part of an .nib file. */
00354 {
00355 return nibLoadAllMasked(0, fileName);
00356 }
00357 
00358 void nibWriteMasked(int options, struct dnaSeq *seq, char *fileName)
00359 /* Write out file in format of four bits per nucleotide, with control over
00360  * handling of masked positions. */
00361 {
00362     nibOutput(options, seq, fileName);
00363 }
00364 
00365 void nibWrite(struct dnaSeq *seq, char *fileName)
00366 /* Write out file in format of four bits per nucleotide. */
00367 {
00368     nibWriteMasked(0, seq, fileName);
00369 }
00370 
00371 struct nibStream
00372 /* Struct to help write a nib file one base at a time. 
00373  * The routines that do this aren't very fast, but they
00374  * aren't used much currently. */
00375     {
00376     struct nibStream *next;
00377     char *fileName;     /* Name of file - allocated here. */
00378     FILE *f;            /* File handle. */
00379     bits32 size;        /* Current size. */
00380     UBYTE byte;         /* Two nibble's worth of data. */
00381     };
00382 
00383 struct nibStream *nibStreamOpen(char *fileName)
00384 /* Create a new nib stream.  Open file and stuff. */
00385 {
00386 struct nibStream *ns;
00387 FILE *f;
00388 
00389 dnaUtilOpen();
00390 AllocVar(ns);
00391 ns->f = f = mustOpen(fileName, "wb");
00392 ns->fileName = cloneString(fileName);
00393 
00394 /* Write header - initially zero.  Will fix it up when we close. */
00395 writeOne(f, ns->size);
00396 writeOne(f, ns->size);
00397 
00398 return ns;
00399 }
00400 
00401 void nibStreamClose(struct nibStream **pNs)
00402 /* Close a nib stream.  Flush last nibble if need be.  Fix up header. */
00403 {
00404 struct nibStream *ns = *pNs;
00405 FILE *f;
00406 bits32 sig = nibSig;
00407 if (ns == NULL)
00408     return;
00409 f = ns->f;
00410 if (ns->size&1)
00411     writeOne(f, ns->byte);
00412 fseek(f,  0L, SEEK_SET);
00413 writeOne(f, sig);
00414 writeOne(f, ns->size);
00415 fclose(f);
00416 freeMem(ns->fileName);
00417 freez(pNs);
00418 }
00419 
00420 void nibStreamOne(struct nibStream *ns, DNA base)
00421 /* Write out one base to nibStream. */
00422 {
00423 UBYTE ub = ntVal5[(int)base];
00424 
00425 if ((++ns->size&1) == 0)
00426     {
00427     ub += ns->byte;
00428     writeOne(ns->f, ub);
00429     }
00430 else
00431     {
00432     ns->byte = (ub<<4);
00433     }
00434 }
00435 
00436 void nibStreamMany(struct nibStream *ns, DNA *dna, int size)
00437 /* Write many bases to nibStream. */
00438 {
00439 int i;
00440 for (i=0; i<size; ++i)
00441     nibStreamOne(ns, *dna++);
00442 }
00443 
00444 boolean nibIsFile(char *fileName)
00445 /* Return TRUE if file is a nib file. */
00446 {
00447 boolean isANib;
00448 char *subrange = findNibSubrange(fileName);
00449 if (subrange != NULL)
00450     *subrange = '\0';
00451 isANib = endsWith(fileName, ".nib") || endsWith(fileName, ".NIB");
00452 if (subrange != NULL)
00453     *subrange = ':';
00454 return isANib;
00455 }
00456 
00457 boolean nibIsRange(char *fileName)
00458 /* Return TRUE if file specifies a subrange of a nib file. */
00459 {
00460 boolean isANib;
00461 char *subrange = findNibSubrange(fileName);;
00462 if (subrange == NULL)
00463     return FALSE;
00464 *subrange = '\0';
00465 isANib = endsWith(fileName, ".nib") || endsWith(fileName, ".NIB");
00466 *subrange = ':';
00467 return isANib;
00468 }
00469 
00470 struct nibInfo *nibInfoNew(char *path)
00471 /* Make a new nibInfo with open nib file. */
00472 {
00473 struct nibInfo *nib;
00474 AllocVar(nib);
00475 nib->fileName = cloneString(path);
00476 nibOpenVerify(path, &nib->f, &nib->size);
00477 return nib;
00478 }
00479 
00480 void nibInfoFree(struct nibInfo **pNib)
00481 /* Free up nib info and close file if open. */
00482 {
00483 struct nibInfo *nib = *pNib;
00484 if (nib != NULL)
00485     {
00486     carefulClose(&nib->f);
00487     freeMem(nib->fileName);
00488     freez(pNib);
00489     }
00490 }
00491 
00492 struct nibInfo *nibInfoFromCache(struct hash *hash, char *nibDir, char *nibName)
00493 /* Get nibInfo on nibDir/nibName.nib from cache, filling cache if need be. */
00494 {
00495 struct nibInfo *nib;
00496 char path[PATH_LEN];
00497 safef(path, sizeof(path), "%s/%s.nib", nibDir, nibName);
00498 nib = hashFindVal(hash, path);
00499 if (nib == NULL)
00500     {
00501     nib = nibInfoNew(path);
00502     hashAdd(hash, path, nib);
00503     }
00504 return nib;
00505 }
00506 
00507 int nibGetSize(char* nibFile)
00508 /* Get the sequence length of a nib */
00509 {
00510 FILE* fh;
00511 int size;
00512 
00513 nibOpenVerify(nibFile, &fh, &size);
00514 carefulClose(&fh);
00515 return size;
00516 }
00517 

Generated on Tue Dec 25 18:39:31 2007 for blat by  doxygen 1.5.2