inc/twoBit.h

Go to the documentation of this file.
00001 /* twoBit - DNA sequence represented as two bits per base
00002  * with associated lists of regions containing N's and of
00003  * masked regions. */
00004 
00005 #ifndef TWOBIT_H
00006 #define TWOBIT_H
00007 
00008 struct twoBit
00009 /* Two bit representation of one DNA sequence; also a node in a list of sequences (see next). */
00010     {
00011     struct twoBit *next;        /* Next sequence in list. */
00012     char *name;                 /* Name of sequence. */
00013     UBYTE *data;                /* DNA at two bits per base. */
00014     bits32 size;                /* Size of this sequence (presumably in bases - TODO confirm). */
00015     bits32 nBlockCount;         /* Count of blocks of Ns. */
00016     bits32 *nStarts;            /* Starts of blocks of Ns (presumably nBlockCount entries - confirm). */
00017     bits32 *nSizes;             /* Sizes of blocks of Ns (presumably nBlockCount entries - confirm). */
00018     bits32 maskBlockCount;      /* Count of masked blocks. */
00019     bits32 *maskStarts;         /* Starts of masked regions (presumably maskBlockCount entries - confirm). */
00020     bits32 *maskSizes;          /* Sizes of masked regions (presumably maskBlockCount entries - confirm). */
00021     bits32 reserved;            /* Reserved for future expansion. */
00022     };
00023 
00024 struct twoBitIndex
00025 /* An entry in twoBit index: pairs a sequence name with an offset in the file. */
00026     {
00027     struct twoBitIndex *next;   /* Next in list. */
00028     char *name;                 /* Name - allocated in hash */
00029     bits32 offset;              /* Offset in file. */
00030     };
00031 
00032 struct twoBitFile
00033 /* Holds header and index info from .2bit file. */
00034     {
00035     struct twoBitFile *next;    /* Next file in list. */
00036     char *fileName;     /* Name of this file, for error reporting. */
00037     FILE *f;            /* Open file. */
00038     boolean isSwapped;  /* Is byte-swapping needed. */
00039     bits32 version;     /* Version of .2bit file */
00040     bits32 seqCount;    /* Number of sequences. */
00041     bits32 reserved;    /* Reserved, always zero for now. */
00042     struct twoBitIndex *indexList;      /* List of index entries (struct twoBitIndex) for the sequences. */
00043     struct hash *hash;  /* Hash of sequences (presumably keyed by sequence name - TODO confirm). */
00044     };
00045 
00046 struct twoBitSpec
00047 /* Parsed .2bit spec: a file name plus the list of sequences/subsequences named in the spec. */
00048 {
00049     char *fileName;                 /* path to file */
00050     struct twoBitSeqSpec *seqs;     /* list of sequences and subsequences */
00051 };
00052 
00053 struct twoBitSeqSpec
00054 /* specification for a seq or subsequence in a .2bit file */
00055 {
00056     struct twoBitSeqSpec *next; /* next spec in list */
00057     char *name;                 /* name of sequence */
00058     bits32 start;              /* start of subsequence; NOTE(review): original comment ended in a bare "0" - presumably 0-based, or 0 if not a subsequence; confirm */
00059     bits32 end;                /* end of subsequence;
00060                                  * 0 if not a subsequence */
00061 };
00062 
00063 struct twoBitFile *twoBitOpen(char *fileName);
00064 /* Open file, read in header and index.  
00065  * Squawk and die if there is a problem. */
00066 
00067 void twoBitClose(struct twoBitFile **pTbf);
00068 /* Free up resources associated with twoBitFile. */
00069 
00070 int twoBitSeqSize(struct twoBitFile *tbf, char *name);
00071 /* Return size of sequence in two bit file in bases. */
00072 
00073 long long twoBitTotalSize(struct twoBitFile *tbf);
00074 /* Return total size of all sequences in two bit file. */
00075 
00076 struct dnaSeq *twoBitReadSeqFragExt(struct twoBitFile *tbf, char *name,
00077                                     int fragStart, int fragEnd, boolean doMask, int *retFullSize);
00078 /* Read part of sequence from .2bit file.  To read full
00079  * sequence call with fragStart=fragEnd=0.  Sequence will be lower
00080  * case if doMask is false, mixed case (repeats in lower)
00081  * if doMask is true.  NOTE(review): retFullSize is not described here - presumably receives the full sequence size; confirm against the implementation. */
00082 
00083 struct dnaSeq *twoBitReadSeqFrag(struct twoBitFile *tbf, char *name,
00084         int fragStart, int fragEnd);
00085 /* Read part of sequence from .2bit file.  To read full
00086  * sequence call with fragStart=fragEnd=0.  Note that sequence will
00087  * be mixed case, with repeats in lower case and rest in
00088  * upper case. */
00089 
00090 struct dnaSeq *twoBitReadSeqFragLower(struct twoBitFile *tbf, char *name,
00091         int fragStart, int fragEnd);
00092 /* Same as twoBitReadSeqFrag, but sequence is returned in lower case. */
00093 
00094 struct dnaSeq *twoBitLoadAll(char *spec);
00095 /* Return list of all sequences matching spec, which is in
00096  * the form:
00097  *
00098  *    file/path/input.2bit[:seqSpec1][,seqSpec2,...]
00099  *
00100  * where seqSpec is either
00101  *     seqName
00102  *  or
00103  *     seqName:start-end */
00104 
00105 struct slName *twoBitSeqNames(char *fileName);
00106 /* Get list of all sequences in twoBit file. */
00107 
00108 struct twoBit *twoBitFromDnaSeq(struct dnaSeq *seq, boolean doMask);
00109 /* Convert dnaSeq representation in memory to twoBit representation.
00110  * If doMask is true interpret lower-case letters as masked. */
00111 
00112 struct twoBit *twoBitFromFile(char *fileName);
00113 /* Get twoBit list of all sequences in twoBit file. */
00114 
00115 void twoBitWriteOne(struct twoBit *twoBit, FILE *f);
00116 /* Write out one twoBit sequence to binary file. 
00117  * Note this does not include the name, which is
00118  * stored only in index. */
00119 
00120 void twoBitWriteHeader(struct twoBit *twoBitList, FILE *f);
00121 /* Write out header portion of twoBit file, including initial
00122  * index */
00123 
00124 boolean twoBitIsFile(char *fileName);
00125 /* Return TRUE if file is in .2bit format. */
00126 
00127 boolean twoBitParseRange(char *rangeSpec, char **retFile, 
00128         char **retSeq, int *retStart, int *retEnd);
00129 /* Parse out something in format
00130  *    file/path/name:seqName:start-end
00131  * or
00132  *    file/path/name:seqName
00133  * This will destroy the input 'rangeSpec' in the process.
00134  * Returns FALSE if it doesn't fit this format. 
00135  * If it is the shorter form then start and end will both
00136  * be returned as zero, which is ok by twoBitReadSeqFrag. */
00137 
00138 boolean twoBitIsRange(char *rangeSpec);
00139 /* Return TRUE if it looks like a two bit range specifier. */
00140 
00141 boolean twoBitIsFileOrRange(char *spec);
00142 /* Return TRUE if it is a two bit file or subrange. */
00143 
00144 boolean twoBitIsSpec(char *spec);
00145 /* Return TRUE if spec is a valid 2bit spec (see twoBitSpecNew) */
00146 
00147 struct twoBitSpec *twoBitSpecNew(char *specStr);
00148 /* Parse a .2bit file and sequence spec into an object.
00149  * The spec is a string in the form:
00150  *
00151  *    file/path/input.2bit[:seqSpec1][,seqSpec2,...]
00152  *
00153  * where seqSpec is either
00154  *     seqName
00155  *  or
00156  *     seqName:start-end
00157  *
00158  * free result with twoBitSpecFree().
00159  */
00160 
00161 struct twoBitSpec *twoBitSpecNewFile(char *twoBitFile, char *specFile);
00162 /* parse a file containing a list of specifications for sequences in the
00163  * specified twoBit file. Specifications are one per line in forms:
00164  *     seqName
00165  *  or
00166  *     seqName:start-end
00167  */
00168 
00169 void twoBitSpecFree(struct twoBitSpec **specPtr);
00170 /* free a twoBitSpec object */
00171 
00172 void twoBitOutNBeds(struct twoBitFile *tbf, char *seqName, FILE *outF);
00173 /* output a series of bed3's that enumerate the number of N's in a sequence*/
00174 
00175 int twoBitSeqSizeNoNs(struct twoBitFile *tbf, char *seqName);
00176 /* return the length of the sequence, not counting N's */
00177 
00178 #endif /* TWOBIT_H */

Generated on Tue Dec 25 18:39:29 2007 for blat by  doxygen 1.5.2