00001 /* twoBit - DNA sequence represented as two bits per pixel 00002 * with associated list of regions containing N's, and 00003 * masked regions. */ 00004 00005 #ifndef TWOBIT_H 00006 #define TWOBIT_H 00007 00008 struct twoBit 00009 /* Two bit representation of DNA. */ 00010 { 00011 struct twoBit *next; /* Next sequence in list */ 00012 char *name; /* Name of sequence. */ 00013 UBYTE *data; /* DNA at two bits per base. */ 00014 bits32 size; /* Size of this sequence. */ 00015 bits32 nBlockCount; /* Count of blocks of Ns. */ 00016 bits32 *nStarts; /* Starts of blocks of Ns. */ 00017 bits32 *nSizes; /* Sizes of blocks of Ns. */ 00018 bits32 maskBlockCount; /* Count of masked blocks. */ 00019 bits32 *maskStarts; /* Starts of masked regions. */ 00020 bits32 *maskSizes; /* Sizes of masked regions. */ 00021 bits32 reserved; /* Reserved for future expansion. */ 00022 }; 00023 00024 struct twoBitIndex 00025 /* An entry in twoBit index. */ 00026 { 00027 struct twoBitIndex *next; /* Next in list. */ 00028 char *name; /* Name - allocated in hash */ 00029 bits32 offset; /* Offset in file. */ 00030 }; 00031 00032 struct twoBitFile 00033 /* Holds header and index info from .2bit file. */ 00034 { 00035 struct twoBitFile *next; 00036 char *fileName; /* Name of this file, for error reporting. */ 00037 FILE *f; /* Open file. */ 00038 boolean isSwapped; /* Is byte-swapping needed. */ 00039 bits32 version; /* Version of .2bit file */ 00040 bits32 seqCount; /* Number of sequences. */ 00041 bits32 reserved; /* Reserved, always zero for now. */ 00042 struct twoBitIndex *indexList; /* List of sequence. */ 00043 struct hash *hash; /* Hash of sequences. */ 00044 }; 00045 00046 struct twoBitSpec 00047 /* parsed .2bit file and sequence specs */ 00048 { 00049 char *fileName; /* path to file */ 00050 struct twoBitSeqSpec *seqs; /* list of sequences and subsequences */ 00051 }; 00052 00053 struct twoBitSeqSpec 00054 /* specification for a seq or subsequence in a .2bit file */ 00055 { 00056 struct twoBitSeqSpec *next; 00057 char *name; /* name of sequence */ 00058 bits32 start; /* start of subsequence 0 */ 00059 bits32 end; /* end of subsequence; 00060 * 0 if not a subsequence */ 00061 }; 00062 00063 struct twoBitFile *twoBitOpen(char *fileName); 00064 /* Open file, read in header and index. 00065 * Squawk and die if there is a problem. */ 00066 00067 void twoBitClose(struct twoBitFile **pTbf); 00068 /* Free up resources associated with twoBitFile. */ 00069 00070 int twoBitSeqSize(struct twoBitFile *tbf, char *name); 00071 /* Return size of sequence in two bit file in bases. */ 00072 00073 long long twoBitTotalSize(struct twoBitFile *tbf); 00074 /* Return total size of all sequences in two bit file. */ 00075 00076 struct dnaSeq *twoBitReadSeqFragExt(struct twoBitFile *tbf, char *name, 00077 int fragStart, int fragEnd, boolean doMask, int *retFullSize); 00078 /* Read part of sequence from .2bit file. To read full 00079 * sequence call with start=end=0. Sequence will be lower 00080 * case if doMask is false, mixed case (repeats in lower) 00081 * if doMask is true. */ 00082 00083 struct dnaSeq *twoBitReadSeqFrag(struct twoBitFile *tbf, char *name, 00084 int fragStart, int fragEnd); 00085 /* Read part of sequence from .2bit file. To read full 00086 * sequence call with start=end=0. Note that sequence will 00087 * be mixed case, with repeats in lower case and rest in 00088 * upper case. */ 00089 00090 struct dnaSeq *twoBitReadSeqFragLower(struct twoBitFile *tbf, char *name, 00091 int fragStart, int fragEnd); 00092 /* Same as twoBitReadSeqFrag, but sequence is returned in lower case. */ 00093 00094 struct dnaSeq *twoBitLoadAll(char *spec); 00095 /* Return list of all sequences matching spec, which is in 00096 * the form: 00097 * 00098 * file/path/input.2bit[:seqSpec1][,seqSpec2,...] 00099 * 00100 * where seqSpec is either 00101 * seqName 00102 * or 00103 * seqName:start-end */ 00104 00105 struct slName *twoBitSeqNames(char *fileName); 00106 /* Get list of all sequences in twoBit file. */ 00107 00108 struct twoBit *twoBitFromDnaSeq(struct dnaSeq *seq, boolean doMask); 00109 /* Convert dnaSeq representation in memory to twoBit representation. 00110 * If doMask is true interpret lower-case letters as masked. */ 00111 00112 struct twoBit *twoBitFromFile(char *fileName); 00113 /* Get twoBit list of all sequences in twoBit file. */ 00114 00115 void twoBitWriteOne(struct twoBit *twoBit, FILE *f); 00116 /* Write out one twoBit sequence to binary file. 00117 * Note this does not include the name, which is 00118 * stored only in index. */ 00119 00120 void twoBitWriteHeader(struct twoBit *twoBitList, FILE *f); 00121 /* Write out header portion of twoBit file, including initial 00122 * index */ 00123 00124 boolean twoBitIsFile(char *fileName); 00125 /* Return TRUE if file is in .2bit format. */ 00126 00127 boolean twoBitParseRange(char *rangeSpec, char **retFile, 00128 char **retSeq, int *retStart, int *retEnd); 00129 /* Parse out something in format 00130 * file/path/name:seqName:start-end 00131 * or 00132 * file/path/name:seqName 00133 * This will destroy the input 'rangeSpec' in the process. 00134 * Returns FALSE if it doesn't fit this format. 00135 * If it is the shorter form then start and end will both 00136 * be returned as zero, which is ok by twoBitReadSeqFrag. */ 00137 00138 boolean twoBitIsRange(char *rangeSpec); 00139 /* Return TRUE if it looks like a two bit range specifier. */ 00140 00141 boolean twoBitIsFileOrRange(char *spec); 00142 /* Return TRUE if it is a two bit file or subrange. */ 00143 00144 boolean twoBitIsSpec(char *spec); 00145 /* Return TRUE spec is a valid 2bit spec (see twoBitSpecNew) */ 00146 00147 struct twoBitSpec *twoBitSpecNew(char *specStr); 00148 /* Parse a .2bit file and sequence spec into an object. 00149 * The spec is a string in the form: 00150 * 00151 * file/path/input.2bit[:seqSpec1][,seqSpec2,...] 00152 * 00153 * where seqSpec is either 00154 * seqName 00155 * or 00156 * seqName:start-end 00157 * 00158 * free result with twoBitSpecFree(). 00159 */ 00160 00161 struct twoBitSpec *twoBitSpecNewFile(char *twoBitFile, char *specFile); 00162 /* parse a file containing a list of specifications for sequences in the 00163 * specified twoBit file. Specifications are one per line in forms: 00164 * seqName 00165 * or 00166 * seqName:start-end 00167 */ 00168 00169 void twoBitSpecFree(struct twoBitSpec **specPtr); 00170 /* free a twoBitSpec object */ 00171 00172 void twoBitOutNBeds(struct twoBitFile *tbf, char *seqName, FILE *outF); 00173 /* output a series of bed3's that enumerate the number of N's in a sequence*/ 00174 00175 int twoBitSeqSizeNoNs(struct twoBitFile *tbf, char *seqName); 00176 /* return the length of the sequence, not counting N's */ 00177 00178 #endif /* TWOBIT_H */
1.5.2