inc/fa.h

Go to the documentation of this file.
00001 /* Routines for reading and writing fasta format sequence files.
00002  *
00003  * This file is copyright 2002 Jim Kent, but license is hereby
00004  * granted for all use - public, private or commercial. */
00005 
00006 #ifndef FA_H
00007 #define FA_H
00008 
00009 #ifndef DNASEQ_H
00010 #include "dnaseq.h"
00011 #endif
00012 
00013 #ifndef LINEFILE_H
00014 #include "linefile.h"
00015 #endif
00016 
00017 struct dnaSeq *faReadDna(char *fileName);
00018 /* Open fa file and read a single dna sequence from it. */
00019 
00020 aaSeq *faReadAa(char *fileName);
00021 /* Open fa file and read a single protein sequence from it. */
00022 
00023 bioSeq *faReadSeq(char *fileName, boolean isDna);
00024 /* Read a dna or protein sequence. */
00025 
00026 struct dnaSeq *faReadAllDna(char *fileName);
00027 /* Return list of all DNA sequences in FA file. */
00028 
00029 struct dnaSeq *faReadAllPep(char *fileName);
00030 /* Return list of all Peptide sequences in FA file. */
00031 
00032 struct dnaSeq *faReadAllSeq(char *fileName, boolean isDna);
00033 /* Return list of all sequences in FA file. */
00034 
00035 struct dnaSeq *faReadAllMixed(char *fileName);
00036 /* Read in mixed case fasta file, preserving case. */
00037 
00038 struct hash *faReadAllIntoHash(char *fileName, enum dnaCase dnaCase);
00039 /* Return hash of all sequences in FA file.  */
00040 
00041 struct dnaSeq *faReadAllMixedInLf(struct lineFile *lf);
00042 /* Read in mixed case sequence from open fasta file. */
00043 
00044 struct dnaSeq *faReadOneDnaSeq(FILE *f, char *name, boolean mustStartWithSign);
00045 /* Read one sequence from FA file. Assumes positioned at or before
00046  * the '>' at start of sequence. */  
00047 
00048 boolean faReadNext(FILE *f, char *defaultName, boolean mustStartWithComment, 
00049     char **retCommentLine, struct dnaSeq **retSeq);
00050 /* Read next sequence from .fa file. Return sequence in retSeq.  If retCommentLine is non-null
00051  * return the '>' line in retCommentLine.   The whole thing returns FALSE at end of file. 
00052  * Assumes positioned at or before the '>' at start of sequence.  File must have been
00053  * opened in binary mode! Note: sequence is mapped to lower case */
00054 
00055 boolean faReadMixedNext(FILE *f, boolean preserveCase, char *defaultName, 
00056     boolean mustStartWithComment, char **retCommentLine, struct dnaSeq **retSeq);
00057 /* Read next sequence from .fa file. Return sequence in retSeq.  If retCommentLine is non-null
00058  * return the '>' line in retCommentLine.   The whole thing returns FALSE at end of file. The preserveCase flag controls whether case is preserved in the sequence. */
00059 
00060 struct dnaSeq *faFromMemText(char *text);
00061 /* Return a sequence from a .fa file that's been read into
00062  * a string in memory. This cannibalizes text, which should
00063  * be allocated with needMem.  This buffer becomes part of
00064  * the returned dnaSeq, which may be freed normally with
00065  * freeDnaSeq. */
00066 
00067 bioSeq *faSeqFromMemText(char *text, boolean isDna);
00068 /* Convert fa in memory to bioSeq. This cannibalizes text
00069  * as does faFromMemText above. */
00070 
00071 bioSeq *faNextSeqFromMemText(char **pText, boolean isDna);
00072 /* Convert fa in memory to bioSeq.  Update *pText to point to next
00073  * record.  Returns NULL when no more sequences left. */
00074 
00075 bioSeq *faNextSeqFromMemTextRaw(char **pText);
00076 /* Same as faNextSeqFromMemText, but will leave in 
00077  * letters even if they aren't in DNA or protein alphabet. */
00078 
00079 bioSeq *faSeqListFromMemText(char *text, boolean isDna);
00080 /* Convert fa's in memory into list of dnaSeqs. */
00081 
00082 bioSeq *faSeqListFromMemTextRaw(char *text);
00083 /* Convert fa's in memory into list of dnaSeqs without
00084  * converting chars to N's. */
00085 
00086 boolean faFastReadNext(FILE *f, DNA **retDna, int *retSize, char **retName);
00087 /* Read in next FA entry as fast as we can. Return FALSE at EOF. 
00088  * The returned DNA and name will be overwritten by the next call
00089  * to this function. */
00090 
00091 boolean faSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName);
00092 /* Read in next FA entry as fast as we can. Faster than that old,
00093  * pokey faFastReadNext. Return FALSE at EOF. 
00094  * The returned DNA and name will be overwritten by the next call
00095  * to this function. */
00096 
00097 boolean faPepSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName);
00098 /* Read in next peptide FA entry as fast as we can.  */
00099 
00100 boolean faSomeSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName, boolean isDna);
00101 /* Read in DNA or Peptide FA record. */
00102 
00103 boolean faMixedSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName);
00104 /* Read in DNA or Peptide FA record in mixed case.   Allows any upper or lower case
00105  * letter, or the dash character, in the sequence. */
00106 
00107 void faToProtein(char *poly, int size);
00108 /* Convert possibly mixed-case protein to upper case.  Also
00109  * convert any strange characters to 'X'.  Does not change size
00110  * of sequence. */
00111 
00112 void faToDna(char *poly, int size);
00113 /* Convert possibly mixed-case DNA to lower case.  Also turn
00114  * any strange characters to 'n'.  Does not change size
00115  * of sequence. */
00116 
00117 void faFreeFastBuf();
00118 /* Free up buffers used in fa fast and speedreading. */
00119 
00120 void faWrite(char *fileName, char *startLine, DNA *dna, int dnaSize);
00121 /* Write out FA file or die trying. */
00122 
00123 void faWriteNext(FILE *f, char *startLine, DNA *dna, int dnaSize);
00124 /* Write next sequence to fa file. */
00125 
00126 void faWriteAll(char *fileName, bioSeq *seqList);
00127 /* Write out all sequences in list to file. */
00128 
00129 #endif /* FA_H */

Generated on Tue Dec 25 18:39:29 2007 for blat by  doxygen 1.5.2