00001 /* Routines for reading and writing fasta format sequence files. 00002 * 00003 * This file is copyright 2002 Jim Kent, but license is hereby 00004 * granted for all use - public, private or commercial. */ 00005 00006 #ifndef FA_H 00007 #define FA_H 00008 00009 #ifndef DNASEQ_H 00010 #include "dnaseq.h" 00011 #endif 00012 00013 #ifndef LINEFILE_H 00014 #include "linefile.h" 00015 #endif 00016 00017 struct dnaSeq *faReadDna(char *fileName); 00018 /* Open fa file and read a single dna sequence from it. */ 00019 00020 aaSeq *faReadAa(char *fileName); 00021 /* Open fa file and read a single dna sequence from it. */ 00022 00023 bioSeq *faReadSeq(char *fileName, boolean isDna); 00024 /* Read a dna or protein sequence. */ 00025 00026 struct dnaSeq *faReadAllDna(char *fileName); 00027 /* Return list of all DNA sequences in FA file. */ 00028 00029 struct dnaSeq *faReadAllPep(char *fileName); 00030 /* Return list of all Peptide sequences in FA file. */ 00031 00032 struct dnaSeq *faReadAllSeq(char *fileName, boolean isDna); 00033 /* Return list of all sequences in FA file. */ 00034 00035 struct dnaSeq *faReadAllMixed(char *fileName); 00036 /* Read in mixed case fasta file, preserving case. */ 00037 00038 struct hash *faReadAllIntoHash(char *fileName, enum dnaCase dnaCase); 00039 /* Return hash of all sequences in FA file. */ 00040 00041 struct dnaSeq *faReadAllMixedInLf(struct lineFile *lf); 00042 /* Read in mixed case sequence from open fasta file. */ 00043 00044 struct dnaSeq *faReadOneDnaSeq(FILE *f, char *name, boolean mustStartWithSign); 00045 /* Read one sequence from FA file. Assumes positioned at or before 00046 * the '>' at start of sequence. */ 00047 00048 boolean faReadNext(FILE *f, char *defaultName, boolean mustStartWithComment, 00049 char **retCommentLine, struct dnaSeq **retSeq); 00050 /* Read next sequence from .fa file. Return sequence in retSeq. If retCommentLine is non-null 00051 * return the '>' line in retCommentLine. 
The whole thing returns FALSE at end of file. 00052 * Assumes positioned at or before the '>' at start of sequence. File must have been 00053 * opened in binary mode! Note: sequence is mapped to lower case */ 00054 00055 boolean faReadMixedNext(FILE *f, boolean preserveCase, char *defaultName, 00056 boolean mustStartWithComment, char **retCommentLine, struct dnaSeq **retSeq); 00057 /* Read next sequence from .fa file. Return sequence in retSeq. If retCommentLine is non-null 00058 * return the '>' line in retCommentLine. The whole thing returns FALSE at end of file. Provides flag for preserving case in sequence */ 00059 00060 struct dnaSeq *faFromMemText(char *text); 00061 /* Return a sequence from a .fa file that's been read into 00062 * a string in memory. This cannabalizes text, which should 00063 * be allocated with needMem. This buffer becomes part of 00064 * the returned dnaSeq, which may be freed normally with 00065 * freeDnaSeq. */ 00066 00067 bioSeq *faSeqFromMemText(char *text, boolean isDna); 00068 /* Convert fa in memory to bioSeq. This cannabalizes text 00069 * as does faFromMemText above. */ 00070 00071 bioSeq *faNextSeqFromMemText(char **pText, boolean isDna); 00072 /* Convert fa in memory to bioSeq. Update *pText to point to next 00073 * record. Returns NULL when no more sequences left. */ 00074 00075 bioSeq *faNextSeqFromMemTextRaw(char **pText); 00076 /* Same as faNextSeqFromMemText, but will leave in 00077 * letters even if they aren't in DNA or protein alphabed. */ 00078 00079 bioSeq *faSeqListFromMemText(char *text, boolean isDna); 00080 /* Convert fa's in memory into list of dnaSeqs. */ 00081 00082 bioSeq *faSeqListFromMemTextRaw(char *text); 00083 /* Convert fa's in memory into list of dnaSeqs without 00084 * converting chars to N's. */ 00085 00086 boolean faFastReadNext(FILE *f, DNA **retDna, int *retSize, char **retName); 00087 /* Read in next FA entry as fast as we can. Return FALSE at EOF. 
00088 * The returned DNA and name will be overwritten by the next call 00089 * to this function. */ 00090 00091 boolean faSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName); 00092 /* Read in next FA entry as fast as we can. Faster than that old, 00093 * pokey faFastReadNext. Return FALSE at EOF. 00094 * The returned DNA and name will be overwritten by the next call 00095 * to this function. */ 00096 00097 boolean faPepSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName); 00098 /* Read in next peptide FA entry as fast as we can. */ 00099 00100 boolean faSomeSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName, boolean isDna); 00101 /* Read in DNA or Peptide FA record. */ 00102 00103 boolean faMixedSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName); 00104 /* Read in DNA or Peptide FA record in mixed case. Allow any upper or lower case 00105 * letter, or the dash character in. */ 00106 00107 void faToProtein(char *poly, int size); 00108 /* Convert possibly mixed-case protein to upper case. Also 00109 * convert any strange characters to 'X'. Does not change size. 00110 * of sequence. */ 00111 00112 void faToDna(char *poly, int size); 00113 /* Convert possibly mixed-case DNA to lower case. Also turn 00114 * any strange characters to 'n'. Does not change size. 00115 * of sequence. */ 00116 00117 void faFreeFastBuf(); 00118 /* Free up buffers used in fa fast and speedreading. */ 00119 00120 void faWrite(char *fileName, char *startLine, DNA *dna, int dnaSize); 00121 /* Write out FA file or die trying. */ 00122 00123 void faWriteNext(FILE *f, char *startLine, DNA *dna, int dnaSize); 00124 /* Write next sequence to fa file. */ 00125 00126 void faWriteAll(char *fileName, bioSeq *seqList); 00127 /* Write out all sequences in list to file. */ 00128 00129 #endif /* FA_H */
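/* 1.5.2 -- stray version stamp from the listing this file was extracted from (presumably the Doxygen version; confirm) */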