inc/gff.h

Go to the documentation of this file.
00001 /*****************************************************************************
00002  * Copyright (C) 2000 Jim Kent.  This source code may be freely used         *
00003  * for personal, academic, and non-profit purposes.  Commercial use          *
00004  * permitted only by explicit agreement with Jim Kent (jim_kent@pacbell.net) *
00005  *****************************************************************************/
00006 /* gff.h Parse a GFF or GTF file. */
00007 
00008 #ifndef GFF_H
00009 #define GFF_H
00010 
00011 struct gffLine
00012 /* One parsed line (record) of a GFF or GTF file; records chain through 'next'. */
00013     {
00014     struct gffLine *next;  /* Next line in file. */
00015     char *seq;      /* Name of sequence. */
00016     char *source;   /* Program that made this line.  Not allocated here. */
00017     char *feature;  /* Type field. (Intron, CDS, etc). Not allocated here. */
00018     int start;      /* Start of feature in sequence. Starts with 0, not 1. */
00019     int end;        /* End of feature in sequence. End is not included. */
00020     double score;   /* Score. */
00021     char strand;    /* Strand of sequence feature is on. + or - or . */
00022     char frame;     /* Frame feature is in. 1, 2, 3, or .  NOTE(review): GFF spec lists 0, 1, 2 - confirm. */
00023     char *group;    /* Group line is in. Not allocated here.  Corresponds to transcript_id in GTF. */
00024     char *geneId;    /* gene_id in GTF, NULL in GFF.  Not allocated here. */
00025     char *exonId;       /* exon_id in GTF, NULL in GFF. Not allocated here. */
00026     int exonNumber; /* 0 in GFF or if missing in GTF.  Otherwise exon number. */
00027     char *intronId;       /* intron_id in GTF, NULL in GFF. Not allocated here. */
00028     char *intronStatus;   /* Intron status. Not allocated here. */
00029     char *proteinId;      /* protein_id in GTF, NULL in GFF. Not allocated here. */
00030     };
00031 
00032 struct gffGroup
00033 /* A group of lines in a GFF file (all lines that share the same group field). */
00034     {
00035     struct gffGroup *next;   /* Next group in file. */
00036     char *name;     /* Name of group. Not allocated here. */
00037     char *seq;      /* Name of sequence. Not allocated here. */
00038     char *source;      /* Name of source program. Not allocated here. */
00039     /* The next three fields are only valid after a call to gffGroupLines(). */
00040     int start;      /* Start of feature in sequence. Starts with 0, not 1. */
00041     int end;        /* End of feature in sequence. End is not included. */
00042     char strand;    /* Strand of sequence. */
00043     struct gffLine *lineList;  /* List of lines in group. */
00044     };
00045 
00046 struct gffSource
00047 /* One entry in a list of GFF sources. */
00048     {
00049     struct gffSource *next; /* Next in list. */
00050     char *name;   /* Name, not allocated here. */
00051     unsigned int id;   /* Database ID (or just 0). */
00052     };
00053 
00054 struct gffFeature
00055 /* One entry in a list of the feature types found in a GFF file. */
00056     {
00057     struct gffFeature *next; /* Next in list. */
00058     char *name;   /* Name, not allocated here. */
00059     };
00060 
00061 struct gffSeqName
00062 /* One entry in a list of sequence names. */
00063     {
00064     struct gffSeqName *next;  /* Next in list. */
00065     char *name;   /* Name, not allocated here. */
00066     };
00067 
00068 struct gffGeneId
00069 /* One entry in a list of gene IDs. */
00070     {
00071     struct gffGeneId *next;  /* Next in list. */
00072     char *name;   /* Name, not allocated here. */
00073     };
00074 
00075 struct gffFile
00076 /* This keeps information on a fully parsed GFF (or GTF) file. */
00077     {
00078     struct gffFile *next;       /* Next in list. */
00079     char *fileName;             /* Name of file (allocated here). */
00080     struct hash *seqHash;       /* A name only hash of the sequence. */
00081     struct hash *sourceHash;    /* A name only hash of gff sources. */
00082     struct hash *featureHash;   /* A name only hash of gff types. */
00083     struct hash *groupHash;     /* Associates group names and gffGroups. */
00084     struct hash *geneIdHash;    /* Hash of all geneIds. */
00085     struct hash *exonHash;       /* Hash of all exonIds. */
00086     struct hash *intronStatusHash;/* Hash of intron statuses. */
00087     struct hash *proteinIdHash;  /* Hash of all proteinIds. */
00088     struct gffLine *lineList;   /* List of lines - lines may be in groupList instead. */
00089     struct gffSeqName *seqList; /* List of sequences in file. */
00090     struct gffSource *sourceList; /* List of all sources in file. */
00091     struct gffFeature *featureList; /* List of all types in file. */
00092     struct gffGroup *groupList; /* A list of groups. */
00093     struct gffGeneId *geneIdList;  /* List of all gene IDs. */
00094     bool isGtf;                 /* Is this a GTF file? */
00095     bool typeKnown;             /* Is 'isGtf' known? */
00096     };
00097 
00098 void gffGroupFree(struct gffGroup **pGroup);
00099 /* Free up a gffGroup including lineList. */
00100 
00101 void gffGroupFreeList(struct gffGroup **pList);
00102 /* Free up a list of gffGroups. */
00103 
00104 void gffGroupLines(struct gffFile *gff);
00105 /* Group lines of gff file together, in the process moving
00106  * gff->lineList to gffGroup->lineList. */
00107 
00108 struct gffFile *gffRead(char *fileName);
00109 /* Create a gffFile structure from a GFF file. */
00110 
00111 struct gffFile *gffFileNew(char *fileName);
00112 /* Create a new gffFile structure. */
00113 
00114 void gffFileAdd(struct gffFile *gff, char *fileName, int baseOffset);
00115 /* Add file to gffFile. */
00116 
00117 void gffFileAddRow(struct gffFile *gff, int baseOffset, char *words[], int wordCount, 
00118                 char *fileName, int lineIx);
00119 /* Process one row of GFF file (a non-comment line, normally parsed by tabs). */
00120 
00121 void gffFileFree(struct gffFile **pGff);
00122 /* Free up a gff file. */
00123 
00124 int gffLineCmp(const void *va, const void *vb);
00125 /* Compare two gffLines (for use in slSort, etc.). */
00126 
00127 void gffOutput(struct gffLine *el, FILE *f, char sep, char lastSep);
00128 /* Print out GTF.  Separate fields with sep. Follow last field with lastSep. */
00129 
00130 boolean gffHasGtfGroup(char *line);
00131 /* Return TRUE if line has a GTF group field. */
00132 
00133 #define gffTabOut(el,f) gffOutput(el,f,'\t','\n')
00134 /* Print out GTF as a line in a tab-separated file.  Expands to a single expression; caller supplies the ';'. */
00135 
00136 #define gffCommaOut(el,f) gffOutput(el,f,',',',')
00137 /* Print out GTF as a comma separated list including final comma.  Expands to a single expression; caller supplies the ';'. */
00138 
00139 
00140 #endif /* GFF_H */
00141 

Generated on Tue Dec 25 18:39:29 2007 for blat by  doxygen 1.5.2