lib/dnaLoad.c

Go to the documentation of this file.
00001 /* dnaLoad - Load dna from a variety of file formats. */
00002 
00003 #include "common.h"
00004 #include "dnaseq.h"
00005 #include "fa.h"
00006 #include "twoBit.h"
00007 #include "nib.h"
00008 #include "dnaLoad.h"
00009 
00010 static char const rcsid[] = "$Id: dnaLoad.c,v 1.8 2005/03/28 17:21:28 hiram Exp $";
00011 
00012 struct dnaLoadStack
00013 /* One open DNA-containing file on a loader's stack: holds either a .2bit handle or a text-file handle (see dnaLoadStackNew). */
00014     {
00015     struct dnaLoadStack *next;  /* Next entry in list; new entries are pushed at the head, so this is the enclosing file. */
00016     struct twoBitFile *twoBit; /* Open two bit file, or NULL when this entry is a text file. */
00017     struct twoBitIndex *tbi;     /* Index entry of the next twoBit sequence to read; NULL once all have been read. */
00018     struct lineFile *textFile;  /* Open text file, or NULL when this entry is a two bit file. */
00019     boolean textIsFa;           /* True if text is in fasta format; otherwise the text is read as a list of file names. */
00020     };
00021 
00022 struct dnaLoad
00023 /* A structure to help us load DNA from files - mixed case
00024  * from either .fa, .nib, or .2bit files */
00025     {
00026     struct dnaLoad *next;       /* Next loader in list. */
00027     char *topFileName;          /* Highest level file name (private copy, freed by dnaLoadClose). */
00028     boolean finished;           /* Set to TRUE once there is nothing more to return. */
00029     struct dnaLoadStack *stack; /* Stack of files we're working on; NULL until the first load that needs it. */
00030     int curStart;               /* Start offset within current parent sequence; reset to 0 by each dnaLoadNext. */
00031     int curEnd;                 /* End offset  within current parent sequence; reset to 0 by each dnaLoadNext. */
00032     int curSize;                /* Size of current parent sequence; reset to 0 by each dnaLoadNext. */
00033     };
00034 
00035 struct dnaLoadStack *dnaLoadStackNew(char *fileName)
00036 /* Create new dnaLoadStack on composite file. */
00037 {
00038 struct dnaLoadStack *dls;
00039 AllocVar(dls);
00040 if (twoBitIsFile(fileName))
00041     {
00042     dls->twoBit = twoBitOpen(fileName);
00043     dls->tbi = dls->twoBit->indexList;
00044     }
00045 else
00046     {
00047     char *line;
00048     dls->textFile = lineFileOpen(fileName, TRUE);
00049     if (lineFileNextReal(dls->textFile, &line))
00050         {
00051         line = trimSpaces(line);
00052         if (line[0] == '>')
00053             dls->textIsFa = TRUE;
00054         lineFileReuse(dls->textFile);
00055         }
00056     }
00057 return dls;
00058 }
00059 
00060 void dnaLoadStackFree(struct dnaLoadStack **pDls)
00061 /* free up resources associated with dnaLoadStack. */
00062 {
00063 struct dnaLoadStack *dls = *pDls;
00064 if (dls != NULL)
00065     {
00066     lineFileClose(&dls->textFile);
00067     twoBitClose(&dls->twoBit);
00068     freez(pDls);
00069     }
00070 }
00071 
00072 void dnaLoadStackFreeList(struct dnaLoadStack **pList)
00073 /* Free a list of dynamically allocated dnaLoadStack's */
00074 {
00075 struct dnaLoadStack *el, *next;
00076 
00077 for (el = *pList; el != NULL; el = next)
00078     {
00079     next = el->next;
00080     dnaLoadStackFree(&el);
00081     }
00082 *pList = NULL;
00083 }
00084 
00085 void dnaLoadClose(struct dnaLoad **pDl)
00086 /* Free up resources associated with dnaLoad. */
00087 {
00088 struct dnaLoad *dl = *pDl;
00089 if (dl != NULL)
00090     {
00091     dnaLoadStackFreeList(&dl->stack);
00092     freeMem(dl->topFileName);
00093     freez(pDl);
00094     }
00095 }
00096 
00097 struct dnaLoad *dnaLoadOpen(char *fileName)
00098 /* Return new DNA loader.  Call dnaLoadNext() on this until
00099  * you get a NULL return, then dnaLoadClose(). */
00100 {
00101 struct dnaLoad *dl;
00102 AllocVar(dl);
00103 dl->topFileName = cloneString(fileName);
00104 return dl;
00105 }
00106 
00107 struct dnaSeq *dnaLoadSingle(char *fileName, int *retStart, int *retEnd, int *retParentSize)
00108 /* Return sequence if it's a nib file or 2bit part, NULL otherwise. */
00109 {
00110 struct dnaSeq *seq = NULL;
00111 unsigned start = 0, end = 0;
00112 int parentSize = 0;
00113 if (nibIsFile(fileName))
00114     {
00115     /* Save offset out of fileName for auto-lifting */
00116     char filePath[PATH_LEN];
00117     char name[PATH_LEN];
00118     nibParseName(0, fileName, filePath, name, &start, &end);
00119 
00120     if (end != 0)       /* It's just a range. */
00121         {
00122         FILE *f;
00123         int size;
00124         nibOpenVerify(filePath, &f, &size);
00125         parentSize = size;
00126         }
00127     seq =  nibLoadAllMasked(NIB_MASK_MIXED, fileName);
00128     if (end == 0)
00129          parentSize = end = seq->size;
00130     freez(&seq->name);
00131     seq->name = cloneString(name);
00132     }
00133 else if (twoBitIsRange(fileName))
00134     {
00135     /* Save offset out of fileName for auto-lifting */
00136     char *rangeSpec = cloneString(fileName);
00137     int start, end;
00138     char *file, *seqName;
00139     twoBitParseRange(rangeSpec, &file, &seqName, &start, &end);
00140 
00141     /* Load sequence. */
00142         {
00143         struct twoBitFile *tbf = twoBitOpen(file);
00144         parentSize = twoBitSeqSize(tbf, seqName);
00145         seq = twoBitReadSeqFrag(tbf, seqName, start, end);
00146         twoBitClose(&tbf);
00147         }
00148     if (end == 0)
00149         end = seq->size;
00150     freez(&rangeSpec);
00151     }
00152 if (retStart != NULL)
00153     *retStart = start;
00154 if (retEnd != NULL)
00155     *retEnd = end;
00156 if (retParentSize != NULL)
00157     *retParentSize = parentSize;
00158 return seq;
00159 }
00160 
00161 static struct dnaSeq *dnaLoadNextFromStack(struct dnaLoad *dl)
00162 /* Load next piece of DNA from stack of files.  Return NULL
00163  * when stack is empty. */
00164 {
00165 struct dnaLoadStack *dls;
00166 struct dnaSeq *seq = NULL;
00167 while ((dls = dl->stack) != NULL)
00168     {
00169     if (dls->twoBit)
00170         {
00171         if (dls->tbi != NULL)
00172             {
00173             seq = twoBitReadSeqFrag(dls->twoBit, dls->tbi->name, 0, 0);
00174             dls->tbi = dls->tbi->next;
00175             return seq;
00176             }
00177         else
00178             {
00179             dl->stack = dls->next;
00180             dnaLoadStackFree(&dls);
00181             }
00182         }
00183     else if (dls->textIsFa)
00184         {
00185         DNA *dna;
00186         char *name;
00187         int size;
00188         if (faMixedSpeedReadNext(dls->textFile, &dna, &size, &name))
00189             {
00190             AllocVar(seq);
00191             seq->dna = needLargeMem(size+1);
00192             memcpy((void *)seq->dna, (void *)dna, size);
00193             seq->dna[size] = 0;
00194             seq->size = size;
00195             seq->name = cloneString(name);
00196             dl->curStart = 0;
00197             dl->curEnd = size;
00198             dl->curSize = size;
00199             return seq;
00200             }
00201         else
00202             {
00203             dl->stack = dls->next;
00204             dnaLoadStackFree(&dls);
00205             }
00206         }
00207     else        /* It's a file full of file names. */
00208         {
00209         char *line;
00210         if (lineFileNextReal(dls->textFile, &line))
00211             {
00212             line  = trimSpaces(line);
00213             if ((seq = dnaLoadSingle(line, &dl->curStart, &dl->curEnd, &dl->curSize)) != NULL)
00214                  return seq;
00215             else
00216                  {
00217                  struct dnaLoadStack *newDls;
00218                  newDls = dnaLoadStackNew(line);
00219                  slAddHead(&dl->stack, newDls);
00220                  }
00221             }
00222         else
00223             {
00224             dl->stack = dls->next;
00225             dnaLoadStackFree(&dls);
00226             }
00227         }
00228     }
00229 dl->finished = TRUE;
00230 return NULL;
00231 }
00232 
00233 static struct dnaSeq *dnaLoadStackOrSingle(struct dnaLoad *dl)
00234 /* Return next dna sequence. */
00235 {
00236 struct dnaSeq *seq = NULL;
00237 if (dl->finished)
00238     return NULL;
00239 if (dl->stack == NULL)
00240     {
00241     if ((seq = dnaLoadSingle(dl->topFileName, &dl->curStart, &dl->curEnd, &dl->curSize)) != NULL)
00242         {
00243         dl->finished = TRUE;
00244         return seq;
00245         }
00246     dl->stack = dnaLoadStackNew(dl->topFileName);
00247     }
00248 return dnaLoadNextFromStack(dl);
00249 }
00250 
00251 struct dnaSeq *dnaLoadNext(struct dnaLoad *dl)
00252 /* Return next dna sequence. */
00253 {
00254 struct dnaSeq *seq;
00255 dl->curSize = dl->curStart = dl->curEnd = 0;
00256 seq = dnaLoadStackOrSingle(dl);
00257 return seq;
00258 }
00259 
00260 struct dnaSeq *dnaLoadAll(char *fileName)
00261 /* Return list of all DNA referenced in file.  File
00262  * can be either a single fasta file, a single .2bit
00263  * file, a .nib file, or a text file containing
00264  * a list of the above files. DNA is mixed case. */
00265 {
00266 struct dnaLoad *dl = dnaLoadOpen(fileName);
00267 struct dnaSeq *seqList = NULL, *seq;
00268 while ((seq = dnaLoadNext(dl)) != NULL)
00269     {
00270     slAddHead(&seqList, seq);
00271     }
00272 dnaLoadClose(&dl);
00273 slReverse(seqList);
00274 return seqList;
00275 }
00276 
00277 int dnaLoadCurStart(struct dnaLoad *dl)
00278 /* Returns the start offset of current sequence within a larger
00279  * sequence.  Useful for programs that want to auto-lift
00280  * nib and 2bit fragments.  Please call only after a
00281  * sucessful dnaLoadNext. */
00282 {
00283 return dl->curStart;
00284 }
00285 
00286 int dnaLoadCurEnd(struct dnaLoad *dl)
00287 /* Returns the end offset of current sequence within a larger
00288  * sequence.  Useful for programs that want to auto-lift
00289  * nib and 2bit fragments.  Please call only after a
00290  * sucessful dnaLoadNext. */
00291 {
00292 return dl->curEnd;
00293 }
00294 
00295 int dnaLoadCurSize(struct dnaLoad *dl)
00296 /* Returns the size of the parent sequence.  Useful for
00297  * auto-lift programs.  Please call only after dnaLoadNext. */
00298 {
00299 return dl->curSize;
00300 }
00301 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2