00001 /***************************************************************************** 00002 * Copyright (C) 2000 Jim Kent. This source code may be freely used * 00003 * for personal, academic, and non-profit purposes. Commercial use * 00004 * permitted only by explicit agreement with Jim Kent (jim_kent@pacbell.net) * 00005 *****************************************************************************/ 00006 /* gff.h Parse a GFF or GTF file. */ 00007 00008 #ifndef GFF_H 00009 #define GFF_H 00010 00011 struct gffLine 00012 /* A parsed line in a GFF file. */ 00013 { 00014 struct gffLine *next; /* Next line in file */ 00015 char *seq; /* Name of sequence. */ 00016 char *source; /* Program that made this line. Not allocated here. */ 00017 char *feature; /* Type field. (Intron, CDS, etc). Not allocated here. */ 00018 int start; /* Start of feature in sequence. Starts with 0, not 1 */ 00019 int end; /* End of feature in sequence. End is not included. */ 00020 double score; /* Score. */ 00021 char strand; /* Strand of sequence feature is on. + or - or .*/ 00022 char frame; /* Frame feature is in. 1, 2, 3, or . */ 00023 char *group; /* Group line is in. Not allocated here. Corresponds to transcript_id in GTF */ 00024 char *geneId; /* gene_id in GTF, NULL in GFF. Not allocated here. */ 00025 char *exonId; /* exon_id in GTF, NULL in GFF. Not allocated here. */ 00026 int exonNumber; /* O in GFF or if missing in GTF. Otherwise exon number. */ 00027 char *intronId; /* intron_id in GTF, NULL in GFF. Not allocated here. */ 00028 char *intronStatus; /* intron status. Not allocated here. */ 00029 char *proteinId; /* protein_id in GTF, NULL in GFF. Not allocated here. */ 00030 }; 00031 00032 struct gffGroup 00033 /* A group of lines in a GFF file (all that share the same group field). */ 00034 { 00035 struct gffGroup *next; /* Next group in file. */ 00036 char *name; /* Name of group. Not allocated here. */ 00037 char *seq; /* Name of sequence. Not allocated here. */ 00038 char *source; /* Name of source program. Not allocated here. */ 00039 /* The next three fields are only valid after call to gffGroupLines() */ 00040 int start; /* Start of feature in sequence. Starts with 0, not 1 */ 00041 int end; /* End of feature in sequence. End is not included. */ 00042 char strand; /* Strand of sequence. */ 00043 struct gffLine *lineList; /* List of lines in group. */ 00044 }; 00045 00046 struct gffSource 00047 /* A list of sources. */ 00048 { 00049 struct gffSource *next; /* Next in list. */ 00050 char *name; /* Name, not allocated here. */ 00051 unsigned int id; /* Database ID (or just 0) */ 00052 }; 00053 00054 struct gffFeature 00055 /* A list of types in GFF file. */ 00056 { 00057 struct gffFeature *next; /* Next in list. */ 00058 char *name; /* Name, not allocated here. */ 00059 }; 00060 00061 struct gffSeqName 00062 /* A list of sequence. */ 00063 { 00064 struct gffSeqName *next; /* Next in list. */ 00065 char *name; /* Name, not allocated here. */ 00066 }; 00067 00068 struct gffGeneId 00069 /* A list of genes. */ 00070 { 00071 struct gffGeneId *next; /* Next in list. */ 00072 char *name; /* Name, not allocated here. */ 00073 }; 00074 00075 struct gffFile 00076 /* This keeps information on a fully parsed GFF file. */ 00077 { 00078 struct gffFile *next; 00079 char *fileName; /* Name of file (allocated here) */ 00080 struct hash *seqHash; /* A name only hash of the sequence. */ 00081 struct hash *sourceHash; /* A name only hash of gff sources. */ 00082 struct hash *featureHash; /* A name only hash of gff types. */ 00083 struct hash *groupHash; /* Associates group names and gffGroups. */ 00084 struct hash *geneIdHash; /* Hash of all geneIds. */ 00085 struct hash *exonHash; /* Hash of all exonIds. */ 00086 struct hash *intronStatusHash;/* Hash of intron statuses. */ 00087 struct hash *proteinIdHash; /* Hash of all proteinIds. */ 00088 struct gffLine *lineList; /* List of lines - lines may be in groupList instead. */ 00089 struct gffSeqName *seqList; /* List of sequences in file. */ 00090 struct gffSource *sourceList; /* List of all sources in file. */ 00091 struct gffFeature *featureList; /* List of all types in file. */ 00092 struct gffGroup *groupList; /* A list of groups. */ 00093 struct gffGeneId *geneIdList; /* List of all gene ID's. */ 00094 bool isGtf; /* Is this a GTF file? */ 00095 bool typeKnown; /* Is 'isGtf' known? */ 00096 }; 00097 00098 void gffGroupFree(struct gffGroup **pGroup); 00099 /* Free up a gffGroup including lineList. */ 00100 00101 void gffGroupFreeList(struct gffGroup **pList); 00102 /* Free up a list of gffGroups. */ 00103 00104 void gffGroupLines(struct gffFile *gff); 00105 /* Group lines of gff file together, in process mofing 00106 * gff->lineList to gffGroup->lineList. */ 00107 00108 struct gffFile *gffRead(char *fileName); 00109 /* Create a gffFile structure from a GFF file. */ 00110 00111 struct gffFile *gffFileNew(char *fileName); 00112 /* Create a new gffFile structure. */ 00113 00114 void gffFileAdd(struct gffFile *gff, char *fileName, int baseOffset); 00115 /* Add file to gffFile. */ 00116 00117 void gffFileAddRow(struct gffFile *gff, int baseOffset, char *words[], int wordCount, 00118 char *fileName, int lineIx); 00119 /* Process one row of GFF file (a non-comment line parsed by tabs normally). */ 00120 00121 void gffFileFree(struct gffFile **pGff); 00122 /* Free up a gff file. */ 00123 00124 int gffLineCmp(const void *va, const void *vb); 00125 /* Compare two gffLines (for use in slSort, etc.) . */ 00126 00127 void gffOutput(struct gffLine *el, FILE *f, char sep, char lastSep); 00128 /* Print out GTF. Separate fields with sep. Follow last field with lastSep. */ 00129 00130 boolean gffHasGtfGroup(char *line); 00131 /* Return TRUE if line has a GTF group field */ 00132 00133 #define gffTabOut(el,f) gffOutput(el,f,'\t','\n'); 00134 /* Print out GTF as a line in a tab-separated file. */ 00135 00136 #define gffCommaOut(el,f) gffOutput(el,f,',',','); 00137 /* Print out GTF as a comma separated list including final comma. */ 00138 00139 00140 #endif /* GFF_H */ 00141
1.5.2