lib/fa.c

Go to the documentation of this file.
00001 /* Routines for reading and writing fasta format sequence files.
00002  *
00003  * This file is copyright 2002 Jim Kent, but license is hereby
00004  * granted for all use - public, private or commercial. */
00005 
00006 #include "common.h"
00007 #include "errabort.h"
00008 #include "hash.h"
00009 #include "portable.h"
00010 #include "dnautil.h"
00011 #include "dnaseq.h"
00012 #include "fa.h"
00013 #include "linefile.h"
00014 
00015 static char const rcsid[] = "$Id: fa.c,v 1.37 2007/02/20 18:50:21 kent Exp $";
00016 
00017 boolean faReadNext(FILE *f, char *defaultName, boolean mustStartWithComment,
00018                          char **retCommentLine, struct dnaSeq **retSeq) 
00019 /* Read next sequence from .fa file. Return sequence in retSeq.  
00020  * If retCommentLine is non-null
00021  * return the '>' line in retCommentLine.   
00022  * The whole thing returns FALSE at end of file.  
00023  * DNA chars are mapped to lower case.*/
00024 {
00025     return faReadMixedNext(f, 0, defaultName, mustStartWithComment,
00026                                         retCommentLine, retSeq);
00027 }
00028 
00029 boolean faReadMixedNext(FILE *f, boolean preserveCase, char *defaultName, 
00030     boolean mustStartWithComment, char **retCommentLine, struct dnaSeq **retSeq)
00031 /* Read next sequence from .fa file. Return sequence in retSeq.  
00032  * If retCommentLine is non-null return the '>' line in retCommentLine.
00033  * The whole thing returns FALSE at end of file. 
00034  * Contains parameter to preserve mixed case. */
00035 {
00036 char lineBuf[1024];
00037 int lineSize;
00038 char *words[1];
00039 int c;
00040 off_t offset = ftello(f);
00041 size_t dnaSize = 0;
00042 DNA *dna, *sequence;
00043 char *name = defaultName;
00044 
00045 if (name == NULL)
00046     name = "";
00047 dnaUtilOpen();
00048 if (retCommentLine != NULL)
00049     *retCommentLine = NULL;
00050 *retSeq = NULL;
00051 
00052 /* Skip first lines until it starts with '>' */
00053 for (;;)
00054     {
00055     if(fgets(lineBuf, sizeof(lineBuf), f) == NULL)
00056         {
00057         if (ferror(f))
00058             errnoAbort("read of fasta file failed");
00059         return FALSE;
00060         }
00061     lineSize = strlen(lineBuf);
00062     if (lineBuf[0] == '>')
00063         {
00064         if (retCommentLine != NULL)
00065             *retCommentLine = cloneString(lineBuf);
00066         offset = ftello(f);
00067         chopByWhite(lineBuf, words, ArraySize(words));
00068         name = words[0]+1;
00069         break;
00070         }
00071     else if (!mustStartWithComment)
00072         {
00073         if (fseeko(f, offset, SEEK_SET) < 0)
00074             errnoAbort("fseek on fasta file failed");
00075         break;
00076         }
00077     else
00078         offset += lineSize;
00079     }
00080 /* Count up DNA. */
00081 for (;;)
00082     {
00083     c = fgetc(f);
00084     if (c == EOF || c == '>')
00085         break;
00086     if (isalpha(c))
00087         {
00088         ++dnaSize;
00089         }
00090     }
00091 
00092 if (dnaSize == 0)
00093     {
00094     warn("Invalid fasta format: sequence size == 0 for element %s",name);
00095     }
00096 
00097 /* Allocate DNA and fill it up from file. */
00098 dna = sequence = needHugeMem(dnaSize+1);
00099 if (fseeko(f, offset, SEEK_SET) < 0)
00100     errnoAbort("fseek on fasta file failed");
00101 for (;;)
00102     {
00103     c = fgetc(f);
00104     if (c == EOF || c == '>')
00105         break;
00106     if (isalpha(c))
00107         {
00108         /* check for non-DNA char */
00109         if (ntChars[c] == 0)
00110             {
00111             *dna++ = preserveCase ? 'N' : 'n';
00112             }
00113         else
00114             {
00115             *dna++ = preserveCase ? c : ntChars[c];
00116             }
00117         }
00118     }
00119 if (c == '>')
00120     ungetc(c, f);
00121 *dna = 0;
00122 
00123 *retSeq = newDnaSeq(sequence, dnaSize, name);
00124 if (ferror(f))
00125     errnoAbort("read of fasta file failed");    
00126 return TRUE;
00127 }
00128 
00129 
00130 struct dnaSeq *faReadOneDnaSeq(FILE *f, char *defaultName, boolean mustStartWithComment)
00131 /* Read sequence from FA file. Assumes positioned at or before
00132  * the '>' at start of sequence. */  
00133 {
00134 struct dnaSeq *seq;
00135 if (!faReadNext(f, defaultName, mustStartWithComment, NULL, &seq))
00136     return NULL;
00137 else
00138     return seq;
00139 }
00140 
00141 static bioSeq *nextSeqFromMem(char **pText, boolean isDna, boolean doFilter)
00142 /* Convert fa in memory to bioSeq.  Update *pText to point to next
00143  * record.  Returns NULL when no more sequences left. */
00144 {
00145 char *name = "";
00146 char *s, *d;
00147 struct dnaSeq *seq;
00148 int size = 0;
00149 char c;
00150 char *filter = (isDna ? ntChars : aaChars);
00151 char *text = *pText;
00152 char *p = skipLeadingSpaces(text);
00153 if (p == NULL)
00154     return NULL;
00155 dnaUtilOpen();
00156 if (*p == '>')
00157     {
00158     char *end;
00159     s = strchr(p, '\n');
00160     if (s != NULL) ++s;
00161     name = skipLeadingSpaces(p+1);
00162     end = skipToSpaces(name);
00163     if (end >= s || name >= s)
00164         errAbort("No name in line starting with '>'");
00165     if (end != NULL)
00166         *end = 0;
00167     }
00168 else
00169     {
00170     s = p; 
00171     if (s == NULL || s[0] == 0)
00172         return NULL;
00173     }
00174 name = cloneString(name);
00175     
00176 d = text;
00177 if (s != NULL)
00178     {
00179     for (;;)
00180         {
00181         c = *s;
00182         if (c == 0 || c == '>')
00183             break;
00184         ++s;
00185         if (!isalpha(c))
00186             continue;
00187         if (doFilter)
00188             {
00189             if ((c = filter[(int)c]) == 0) 
00190                 {
00191                 if (isDna)
00192                     c = 'n';
00193                 else
00194                     c = 'X';
00195                 }
00196             }
00197         d[size++] = c;
00198         }
00199     }
00200 d[size] = 0;
00201 
00202 /* Put sequence into our little sequence structure. */
00203 AllocVar(seq);
00204 seq->name = name;
00205 seq->dna = text;
00206 seq->size = size;
00207 *pText = s;
00208 return seq;
00209 }
00210 
00211 bioSeq *faNextSeqFromMemText(char **pText, boolean isDna)
00212 /* Convert fa in memory to bioSeq.  Update *pText to point to next
00213  * record.  Returns NULL when no more sequences left. */
00214 {
00215 return nextSeqFromMem(pText, isDna, TRUE);
00216 }
00217 
00218 bioSeq *faNextSeqFromMemTextRaw(char **pText)
00219 /* Same as faNextSeqFromMemText, but will leave in 
00220  * letters even if they aren't in DNA or protein alphabed. */
00221 {
00222 return nextSeqFromMem(pText, TRUE, FALSE);
00223 }
00224 
00225 bioSeq *faSeqListFromMemText(char *text, boolean isDna)
00226 /* Convert fa's in memory into list of dnaSeqs. */
00227 {
00228 bioSeq *seqList = NULL, *seq;
00229 while ((seq = faNextSeqFromMemText(&text, isDna)) != NULL)
00230     {
00231     slAddHead(&seqList, seq);
00232     }
00233 slReverse(&seqList);
00234 return seqList;
00235 }
00236 
00237 bioSeq *faSeqListFromMemTextRaw(char *text)
00238 /* Convert fa's in memory into list of dnaSeqs without
00239  * converting chars to N's. */
00240 {
00241 bioSeq *seqList = NULL, *seq;
00242 while ((seq = faNextSeqFromMemTextRaw(&text)) != NULL)
00243     {
00244     slAddHead(&seqList, seq);
00245     }
00246 slReverse(&seqList);
00247 return seqList;
00248 }
00249 
00250 
00251 bioSeq *faSeqFromMemText(char *text, boolean isDna)
00252 /* Convert fa in memory to bioSeq. */
00253 {
00254 return faNextSeqFromMemText(&text, isDna);
00255 }
00256 
00257 struct dnaSeq *faFromMemText(char *text)
00258 /* Return a sequence from a .fa file that's been read into
00259  * a string in memory. This cannabalizes text, which should
00260  * be allocated with needMem.  This buffer becomes part of
00261  * the returned dnaSeq, which may be freed normally with
00262  * freeDnaSeq. */
00263 {
00264 return faNextSeqFromMemText(&text, TRUE);
00265 }
00266 
00267 struct dnaSeq *faReadSeq(char *fileName, boolean isDna)
00268 /* Open fa file and read a single sequence from it. */
00269 {
00270 int maxSize = fileSize(fileName);
00271 int fd;
00272 DNA *s;
00273 
00274 if (maxSize < 0)
00275     errAbort("can't open %s", fileName);
00276 s = needHugeMem(maxSize+1);
00277 fd = open(fileName, O_RDONLY);
00278 read(fd, s, maxSize);
00279 close(fd);
00280 s[maxSize] = 0;
00281 return faSeqFromMemText(s, isDna);
00282 }
00283 
00284 struct dnaSeq *faReadDna(char *fileName)
00285 /* Open fa file and read a single nucleotide sequence from it. */
00286 {
00287 return faReadSeq(fileName, TRUE);
00288 }
00289 
00290 struct dnaSeq *faReadAa(char *fileName)
00291 /* Open fa file and read a peptide single sequence from it. */
00292 {
00293 return faReadSeq(fileName, FALSE);
00294 }
00295 
00296 static unsigned faFastBufSize = 0;
00297 static DNA *faFastBuf;
00298 
00299 static void expandFaFastBuf(int bufPos, int minExp)
00300 /* Make faFastBuf bigger. */
00301 {
00302 if (faFastBufSize == 0)
00303     {
00304     faFastBufSize = 64 * 1024;
00305     while (minExp > faFastBufSize)
00306         faFastBufSize <<= 1;
00307     faFastBuf = needHugeMem(faFastBufSize);
00308     }
00309 else
00310     {
00311     DNA *newBuf;
00312     unsigned newBufSize = faFastBufSize + faFastBufSize;
00313     while (newBufSize < minExp)
00314         {
00315         newBufSize <<= 1;
00316         if (newBufSize <= 0)
00317             errAbort("expandFaFastBuf: integer overflow when trying to "
00318                      "increase buffer size from %u to a min of %u.",
00319                      faFastBufSize, minExp);
00320         }
00321     newBuf = needHugeMem(newBufSize);
00322     memcpy(newBuf, faFastBuf, bufPos);
00323     freeMem(faFastBuf);
00324     faFastBuf = newBuf;
00325     faFastBufSize = newBufSize;
00326     }
00327 }
00328 
00329 void faFreeFastBuf()
00330 /* Free up buffers used in fa fast and speedreading. */
00331 {
00332 freez(&faFastBuf);
00333 faFastBufSize = 0;
00334 }
00335 
00336 boolean faFastReadNext(FILE *f, DNA **retDna, int *retSize, char **retName)
00337 /* Read in next FA entry as fast as we can. Return FALSE at EOF. 
00338  * The returned DNA and name will be overwritten by the next call
00339  * to this function. */
00340 {
00341 int c;
00342 int bufIx = 0;
00343 static char name[256];
00344 int nameIx = 0;
00345 boolean gotSpace = FALSE;
00346 
00347 /* Seek to next '\n' and save first word as name. */
00348 dnaUtilOpen();
00349 name[0] = 0;
00350 for (;;)
00351     {
00352     if ((c = fgetc(f)) == EOF)
00353         {
00354         *retDna = NULL;
00355         *retSize = 0;
00356         *retName = NULL;
00357         return FALSE;
00358         }
00359     if (!gotSpace && nameIx < ArraySize(name)-1)
00360         {
00361         if (isspace(c))
00362             gotSpace = TRUE;
00363         else if (c != '>')
00364             {
00365             name[nameIx++] = c;
00366             }
00367         }
00368     if (c == '\n')
00369         break;
00370     }
00371 name[nameIx] = 0;
00372 /* Read until next '>' */
00373 for (;;)
00374     {
00375     c = fgetc(f);
00376     if (c == EOF || c == '>')
00377         c = 0;
00378     else if (!isalpha(c))
00379         continue;
00380     else
00381         {
00382         c = ntChars[c];
00383         if (c == 0) c = 'n';
00384         }
00385     if (bufIx >= faFastBufSize)
00386         expandFaFastBuf(bufIx, 0);
00387     faFastBuf[bufIx++] = c;
00388     if (c == 0)
00389         {
00390         *retDna = faFastBuf;
00391         *retSize = bufIx-1;
00392         *retName = name;
00393         return TRUE;
00394         }
00395     }
00396 }
00397 
00398 
00399 void faWriteNext(FILE *f, char *startLine, DNA *dna, int dnaSize)
00400 /* Write next sequence to fa file. */
00401 {
00402 if (dnaSize == 0)
00403     return;
00404 if (startLine != NULL)
00405     fprintf(f, ">%s\n", startLine);
00406 writeSeqWithBreaks(f, dna, dnaSize, 50);
00407 }
00408 
00409 void faWrite(char *fileName, char *startLine, DNA *dna, int dnaSize)
00410 /* Write out FA file or die trying. */
00411 {
00412 FILE *f = mustOpen(fileName, "w");
00413 faWriteNext(f, startLine, dna, dnaSize);
00414 if (fclose(f) != 0)
00415     errnoAbort("fclose failed");
00416 }
00417 
00418 void faWriteAll(char *fileName, bioSeq *seqList)
00419 /* Write out all sequences in list to file. */
00420 {
00421 FILE *f = mustOpen(fileName, "w");
00422 bioSeq *seq;
00423 
00424 for (seq=seqList; seq != NULL; seq = seq->next)
00425     faWriteNext(f, seq->name, seq->dna, seq->size);
00426 if (fclose(f) != 0)
00427     errnoAbort("fclose failed");
00428 }
00429 
00430 boolean faMixedSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName)
00431 /* Read in DNA or Peptide FA record in mixed case.   Allow any upper or lower case
00432  * letter, or the dash character in. */
00433 {
00434 char c;
00435 int bufIx = 0;
00436 static char name[512];
00437 int lineSize, i;
00438 char *line;
00439 
00440 dnaUtilOpen();
00441 
00442 /* Read first line, make sure it starts with '>', and read first word
00443  * as name of sequence. */
00444 name[0] = 0;
00445 if (!lineFileNext(lf, &line, &lineSize))
00446     {
00447     *retDna = NULL;
00448     *retSize = 0;
00449     return FALSE;
00450     }
00451 if (line[0] == '>')
00452     {
00453     line = firstWordInLine(skipLeadingSpaces(line+1));
00454     if (line == NULL)
00455         errAbort("Expecting sequence name after '>' line %d of %s", lf->lineIx, lf->fileName);
00456     strncpy(name, line, sizeof(name));
00457     name[sizeof(name)-1] = '\0'; /* Just to make sure name is NULL terminated. */
00458     }
00459 else
00460     {
00461     errAbort("Expecting '>' line %d of %s", lf->lineIx, lf->fileName);
00462     }
00463 /* Read until next '>' */
00464 for (;;)
00465     {
00466     if (!lineFileNext(lf, &line, &lineSize))
00467         break;
00468     if (line[0] == '>')
00469         {
00470         lineFileReuse(lf);
00471         break;
00472         }
00473     if (bufIx + lineSize >= faFastBufSize)
00474         expandFaFastBuf(bufIx, lineSize);
00475     for (i=0; i<lineSize; ++i)
00476         {
00477         c = line[i];
00478         if (isalpha(c) || c == '-')
00479             faFastBuf[bufIx++] = c;
00480         }
00481     }
00482 if (bufIx >= faFastBufSize)
00483     expandFaFastBuf(bufIx, 0);
00484 faFastBuf[bufIx] = 0;
00485 *retDna = faFastBuf;
00486 *retSize = bufIx;
00487 *retName = name;
00488 if (bufIx == 0)
00489     {
00490     warn("Invalid fasta format: sequence size == 0 for element %s",name);
00491     }
00492 
00493 return TRUE;
00494 }
00495 
00496 void faToProtein(char *poly, int size)
00497 /* Convert possibly mixed-case protein to upper case.  Also
00498  * convert any strange characters to 'X'.  Does not change size.
00499  * of sequence. */
00500 {
00501 int i;
00502 char c;
00503 dnaUtilOpen();
00504 for (i=0; i<size; ++i)
00505     {
00506     if ((c = aaChars[(int)poly[i]]) == 0)
00507         c = 'X';
00508     poly[i] = c;
00509     }
00510 }
00511 
00512 void faToDna(char *poly, int size)
00513 /* Convert possibly mixed-case DNA to lower case.  Also turn
00514  * any strange characters to 'n'.  Does not change size.
00515  * of sequence. */
00516 {
00517 int i;
00518 char c;
00519 dnaUtilOpen();
00520 for (i=0; i<size; ++i)
00521     {
00522     if ((c = ntChars[(int)poly[i]]) == 0)
00523         c = 'n';
00524     poly[i] = c;
00525     }
00526 }
00527 
00528 boolean faSomeSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName, boolean isDna)
00529 /* Read in DNA or Peptide FA record. */
00530 {
00531 char *poly;
00532 int size;
00533 
00534 if (!faMixedSpeedReadNext(lf, retDna, retSize, retName))
00535     return FALSE;
00536 size = *retSize;
00537 poly = *retDna;
00538 if (isDna)
00539     faToDna(poly, size);
00540 else
00541     faToProtein(poly, size);
00542 return TRUE;
00543 }
00544 
00545 boolean faPepSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName)
00546 /* Read in next peptide FA entry as fast as we can.  */
00547 {
00548 return faSomeSpeedReadNext(lf, retDna, retSize, retName, FALSE);
00549 }
00550 
00551 boolean faSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName)
00552 /* Read in next FA entry as fast as we can. Faster than that old,
00553  * pokey faFastReadNext. Return FALSE at EOF. 
00554  * The returned DNA and name will be overwritten by the next call
00555  * to this function. */
00556 {
00557 return faSomeSpeedReadNext(lf, retDna, retSize, retName, TRUE);
00558 }
00559 
00560 static struct dnaSeq *faReadAllMixableInLf(struct lineFile *lf, 
00561         boolean isDna, boolean mixed)
00562 /* Return list of all sequences from open fa file. 
00563  * Mixed case parameter overrides isDna.  If mixed is false then
00564  * will return DNA in lower case and non-DNA in upper case. */
00565 {
00566 struct dnaSeq *seqList = NULL, *seq;
00567 DNA *dna;
00568 char *name;
00569 int size;
00570 boolean ok;
00571 
00572 for (;;)
00573     {
00574     if (mixed)
00575         ok = faMixedSpeedReadNext(lf, &dna, &size, &name);
00576     else
00577         ok = faSomeSpeedReadNext(lf, &dna, &size, &name, isDna);
00578     if (!ok)
00579         break;
00580     AllocVar(seq);
00581     seq->name = cloneString(name);
00582     seq->size = size;
00583     seq->dna = cloneMem(dna, size+1);
00584     slAddHead(&seqList, seq);
00585     }
00586 slReverse(&seqList);
00587 faFreeFastBuf();
00588 return seqList;
00589 }
00590 
00591 static struct dnaSeq *faReadAllSeqMixable(char *fileName, boolean isDna, boolean mixed)
00592 /* Return list of all sequences in FA file. 
00593  * Mixed case parameter overrides isDna.  If mixed is false then
00594  * will return DNA in lower case and non-DNA in upper case. */
00595 {
00596 struct lineFile *lf = lineFileOpen(fileName, FALSE);
00597 struct dnaSeq *seqList = faReadAllMixableInLf(lf, isDna, mixed);
00598 lineFileClose(&lf);
00599 return seqList;
00600 }
00601 
00602 struct hash *faReadAllIntoHash(char *fileName, enum dnaCase dnaCase)
00603 /* Return hash of all sequences in FA file.  */
00604 {
00605 boolean isDna = (dnaCase == dnaLower);
00606 boolean isMixed = (dnaCase == dnaMixed);
00607 struct dnaSeq *seqList = faReadAllSeqMixable(fileName, isDna, isMixed);
00608 struct hash *hash = hashNew(18);
00609 struct dnaSeq *seq;
00610 for (seq = seqList; seq != NULL; seq = seq->next)
00611     {
00612     if (hashLookup(hash, seq->name))
00613         errAbort("%s duplicated in %s", seq->name, fileName);
00614     hashAdd(hash, seq->name, seq);
00615     }
00616 return hash;
00617 }
00618 
00619 
00620 struct dnaSeq *faReadAllSeq(char *fileName, boolean isDna)
00621 /* Return list of all sequences in FA file. */
00622 {
00623 return faReadAllSeqMixable(fileName, isDna, FALSE);
00624 }
00625 
00626 struct dnaSeq *faReadAllDna(char *fileName)
00627 /* Return list of all DNA sequences in FA file. */
00628 {
00629 return faReadAllSeq(fileName, TRUE);
00630 }
00631 
00632 struct dnaSeq *faReadAllPep(char *fileName)
00633 /* Return list of all peptide sequences in FA file. */
00634 {
00635 return faReadAllSeq(fileName, FALSE);
00636 }
00637 
00638 struct dnaSeq *faReadAllMixed(char *fileName)
00639 /* Read in mixed case fasta file, preserving case. */
00640 {
00641 return faReadAllSeqMixable(fileName, FALSE, TRUE);
00642 }
00643 
00644 struct dnaSeq *faReadAllMixedInLf(struct lineFile *lf)
00645 /* Read in mixed case sequence from open fasta file. */
00646 {
00647 return faReadAllMixableInLf(lf, FALSE, TRUE);
00648 }

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2