lib/gff.c File Reference

#include "common.h"
#include "hash.h"
#include "linefile.h"
#include "gff.h"
#include "obscure.h"

Include dependency graph for gff.c:

Go to the source code of this file.

Functions

void gffGroupFree (struct gffGroup **pGroup)
void gffGroupFreeList (struct gffGroup **pList)
void gffFileFree (struct gffFile **pGff)
int gffLineCmp (const void *va, const void *vb)
static void gffSyntaxError (char *fileName, int line, char *msg)
static char * gffTnName (char *seqName, char *groupName)
static boolean isGtfGroup (char *group)
boolean gffHasGtfGroup (char *line)
static void readQuotedString (char *fileName, int lineIx, char *in, char *out, char **retNext)
static void parseGtfEnd (char *s, struct gffFile *gff, struct gffLine *gl, char *fileName, int lineIx)
void gffFileAddRow (struct gffFile *gff, int baseOffset, char *words[], int wordCount, char *fileName, int lineIx)
void gffFileAdd (struct gffFile *gff, char *fileName, int baseOffset)
gffFilegffFileNew (char *fileName)
gffFilegffRead (char *fileName)
static void getGroupBoundaries (struct gffGroup *group)
void gffGroupLines (struct gffFile *gff)
void gffOutput (struct gffLine *el, FILE *f, char sep, char lastSep)

Variables

static char const rcsid [] = "$Id: gff.c,v 1.21 2007/02/01 00:43:16 kate Exp $"


Function Documentation

static void getGroupBoundaries ( struct gffGroup group  )  [static]

Definition at line 375 of file gff.c.

References gffLine::end, gffLine::group, gffLine::next, gffLine::start, and gffLine::strand.

Referenced by gffGroupLines().

00377 {
00378 struct gffLine *line;
00379 int start = 0x3fffffff;
00380 int end = -start;
00381 line = group->lineList;
00382 group->strand = line->strand;
00383 for (; line != NULL; line = line->next)
00384     {
00385     if (start > line->start)
00386         start = line->start;
00387     if (end < line->end)
00388         end = line->end;
00389     }
00390 group->start = start;
00391 group->end = end;
00392 }

Here is the caller graph for this function:

void gffFileAdd ( struct gffFile gff,
char *  fileName,
int  baseOffset 
)

Definition at line 324 of file gff.c.

References chopTabs, gffFile::featureList, lineFile::fileName, gffFile::geneIdList, gffFileAddRow(), gffFile::groupList, lineFileClose(), lineFileNext(), lineFileOpen(), lineFile::lineIx, gffFile::lineList, gffFile::seqList, slReverse(), gffFile::sourceList, and TRUE.

Referenced by gffRead().

00326 {
00327 /* Open file and do basic allocations. */
00328 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00329 char *line, *words[9];
00330 int lineSize, wordCount;
00331 
00332 while (lineFileNext(lf, &line, &lineSize))
00333     {
00334     if (line[0] != '#')
00335         {
00336         wordCount = chopTabs(line, words);
00337         if (wordCount > 0)
00338             gffFileAddRow(gff, baseOffset, words, wordCount, lf->fileName, lf->lineIx);
00339         }
00340     }
00341 slReverse(&gff->lineList);
00342 slReverse(&gff->seqList);
00343 slReverse(&gff->sourceList);
00344 slReverse(&gff->featureList);
00345 slReverse(&gff->groupList);
00346 slReverse(&gff->geneIdList);
00347 lineFileClose(&lf);
00348 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gffFileAddRow ( struct gffFile gff,
int  baseOffset,
char *  words[],
int  wordCount,
char *  fileName,
int  lineIx 
)

Definition at line 244 of file gff.c.

References AllocVar, gffLine::end, gffLine::feature, gffFile::featureHash, gffFile::featureList, gffLine::frame, gffSyntaxError(), gffTnName(), gffLine::group, gffFile::groupHash, gffFile::groupList, hashAdd(), hashLookup(), gffFile::isGtf, isGtfGroup(), gffFile::lineList, gffGroup::name, gffFeature::name, gffSource::name, hashEl::name, gffSeqName::name, parseGtfEnd(), gffLine::score, gffGroup::seq, gffLine::seq, gffFile::seqHash, gffFile::seqList, slAddHead, gffGroup::source, gffLine::source, gffFile::sourceHash, gffFile::sourceList, gffLine::start, gffLine::strand, TRUE, and gffFile::typeKnown.

Referenced by gffFileAdd().

00247 {
00248 struct hashEl *hel;
00249 struct gffLine *gl;
00250 
00251 if (wordCount < 8)
00252     gffSyntaxError(fileName, lineIx, "Word count less than 8 ");
00253 AllocVar(gl);
00254 
00255 if ((hel = hashLookup(gff->seqHash, words[0])) == NULL)
00256     {
00257     struct gffSeqName *el;
00258     AllocVar(el);
00259     hel = hashAdd(gff->seqHash, words[0], el);
00260     el->name = hel->name;
00261     slAddHead(&gff->seqList, el);
00262     }
00263 gl->seq = hel->name;
00264 
00265 if ((hel = hashLookup(gff->sourceHash, words[1])) == NULL)
00266     {
00267     struct gffSource *el;
00268     AllocVar(el);
00269     hel = hashAdd(gff->sourceHash, words[1], el);
00270     el->name = hel->name;
00271     slAddHead(&gff->sourceList, el);
00272     }
00273 gl->source = hel->name;
00274 
00275 if ((hel = hashLookup(gff->featureHash, words[2])) == NULL)
00276     {
00277     struct gffFeature *el;
00278     AllocVar(el);
00279     hel = hashAdd(gff->featureHash, words[2], el);
00280     el->name = hel->name;
00281     slAddHead(&gff->featureList, el);
00282     }
00283 gl->feature = hel->name;
00284 
00285 if (!isdigit(words[3][0]) || !isdigit(words[4][0]))
00286    gffSyntaxError(fileName, lineIx, "col 3 or 4 not a number ");        
00287 gl->start = atoi(words[3])-1 + baseOffset;
00288 gl->end = atoi(words[4]) + baseOffset;
00289 gl->score = atof(words[5]);
00290 gl->strand = words[6][0];
00291 gl->frame = words[7][0];
00292 
00293 if (wordCount >= 9)
00294     {
00295     if (!gff->typeKnown)
00296         {
00297         gff->typeKnown = TRUE;
00298         gff->isGtf = isGtfGroup(words[8]);
00299         }
00300     if (gff->isGtf)
00301         {
00302         parseGtfEnd(words[8], gff, gl, fileName, lineIx);
00303         }
00304     else
00305         {
00306         char *tnName = gffTnName(gl->seq, words[8]);
00307         if ((hel = hashLookup(gff->groupHash, tnName)) == NULL)
00308             {
00309             struct gffGroup *group;
00310             AllocVar(group);
00311             hel = hashAdd(gff->groupHash, tnName, group);
00312             group->name = hel->name;
00313             group->seq = gl->seq;
00314             group->source = gl->source;
00315             slAddHead(&gff->groupList, group);
00316             }
00317         gl->group = hel->name;
00318         }
00319     }
00320 slAddHead(&gff->lineList, gl);
00321 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gffFileFree ( struct gffFile **  pGff  ) 

Definition at line 39 of file gff.c.

References gffFile::exonHash, gffFile::featureHash, gffFile::featureList, gffFile::fileName, freeHash(), freeMem(), freez(), gffFile::geneIdHash, gffFile::geneIdList, gffGroupFreeList(), gffFile::groupHash, gffFile::groupList, gffFile::intronStatusHash, gffFile::lineList, gffFile::proteinIdHash, gffFile::seqHash, gffFile::seqList, slFreeList(), gffFile::sourceHash, and gffFile::sourceList.

00041 {
00042 struct gffFile *gff;
00043 if ((gff = *pGff) != NULL)
00044     {
00045     freeMem(gff->fileName);
00046     freeHash(&gff->seqHash);
00047     freeHash(&gff->sourceHash);
00048     freeHash(&gff->featureHash);
00049     freeHash(&gff->groupHash);
00050     freeHash(&gff->geneIdHash);
00051     freeHash(&gff->exonHash);
00052     freeHash(&gff->intronStatusHash);
00053     freeHash(&gff->proteinIdHash);
00054     slFreeList(&gff->lineList);
00055     slFreeList(&gff->seqList);
00056     slFreeList(&gff->sourceList);
00057     slFreeList(&gff->featureList);
00058     slFreeList(&gff->geneIdList);
00059     gffGroupFreeList(&gff->groupList);
00060     freez(pGff);
00061     }
00062 }

Here is the call graph for this function:

struct gffFile* gffFileNew ( char *  fileName  )  [read]

Definition at line 350 of file gff.c.

References AllocVar, cloneString(), gffFile::exonHash, gffFile::featureHash, gffFile::fileName, gffFile::geneIdHash, gffFile::groupHash, gffFile::intronStatusHash, newHash(), gffFile::proteinIdHash, gffFile::seqHash, and gffFile::sourceHash.

Referenced by gffRead().

00352 {
00353 struct gffFile *gff;
00354 AllocVar(gff);
00355 gff->fileName = cloneString(fileName);
00356 gff->seqHash = newHash(6);
00357 gff->sourceHash = newHash(6);
00358 gff->featureHash = newHash(6);
00359 gff->groupHash = newHash(12);
00360 gff->geneIdHash = newHash(12);
00361 gff->exonHash = newHash(16);
00362 gff->intronStatusHash = newHash(4);
00363 gff->proteinIdHash = newHash(12);
00364 return gff;
00365 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gffGroupFree ( struct gffGroup **  pGroup  ) 

Definition at line 15 of file gff.c.

References freez(), gffGroup::lineList, and slFreeList().

Referenced by gffGroupFreeList().

00017 {
00018 struct gffGroup *group;
00019 if ((group = *pGroup) != NULL)
00020     {
00021     slFreeList(&group->lineList);
00022     freez(pGroup);
00023     }
00024 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gffGroupFreeList ( struct gffGroup **  pList  ) 

Definition at line 26 of file gff.c.

References gffGroupFree(), and gffGroup::next.

Referenced by gffFileFree().

00028 {
00029 struct gffGroup *el, *next;
00030 for (el = *pList; el != NULL; el = next)
00031     {
00032     next = el->next;
00033     gffGroupFree(&el);
00034     }
00035 *pList = NULL;
00036 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gffGroupLines ( struct gffFile gff  ) 

Definition at line 394 of file gff.c.

References getGroupBoundaries(), gffLineCmp(), gffLine::group, gffFile::groupHash, gffFile::groupList, hashLookup(), gffGroup::lineList, gffFile::lineList, gffGroup::next, gffLine::next, nextLine(), slAddHead, slReverse(), slSort(), and hashEl::val.

00397 {
00398 struct gffLine *line, *nextLine;
00399 struct hash *groupHash = gff->groupHash;
00400 char *groupName;
00401 struct gffGroup *group;
00402 struct gffLine *ungroupedLines = NULL;
00403 
00404 for (line = gff->lineList; line != NULL; line = nextLine)
00405     {
00406     nextLine = line->next;
00407     if ((groupName = line->group) != NULL)
00408         {
00409         struct hashEl *hel = hashLookup(groupHash, groupName);
00410         group = hel->val;
00411         slAddHead(&group->lineList, line);
00412         }
00413     else
00414         {
00415         slAddHead(&ungroupedLines, line);
00416         }
00417     }
00418 
00419 /* Restore ungrouped lines to gff->lineList. */
00420 slReverse(&ungroupedLines);
00421 gff->lineList = ungroupedLines;
00422 
00423 /* Restore order of grouped lines and fill in start and end. */
00424 for (group = gff->groupList; group != NULL; group = group->next)
00425     {
00426     slSort(&group->lineList, gffLineCmp);
00427     getGroupBoundaries(group);
00428     }
00429 }

Here is the call graph for this function:

boolean gffHasGtfGroup ( char *  line  ) 

Definition at line 112 of file gff.c.

References chopTabs, cloneString(), FALSE, freeMem(), isGtfGroup(), and TRUE.

00114 {
00115 char *words[10];
00116 char *dupe = cloneString(line);
00117 int wordCt = chopTabs(dupe, words);
00118 boolean isGtf = FALSE;
00119 if (wordCt >= 9) 
00120     if (isGtfGroup(words[8]))
00121         isGtf = TRUE;
00122 freeMem(dupe);
00123 return isGtf;
00124 }

Here is the call graph for this function:

int gffLineCmp ( const void *  va,
const void *  vb 
)

Definition at line 64 of file gff.c.

References gffLine::end, gffLine::seq, and gffLine::start.

Referenced by gffGroupLines().

00066 {
00067 const struct gffLine *a = *((struct gffLine **)va);
00068 const struct gffLine *b = *((struct gffLine **)vb);
00069 int diff;
00070 
00071 /* for overlaping starts, sort by end, genePredFromGroupedGtf() depends on
00072  * this */
00073 diff = strcmp(a->seq, b->seq);
00074 if (diff == 0)
00075     diff = a->start - b->start;
00076 if (diff == 0)
00077     diff = a->end - b->end;
00078 return diff;
00079 }

Here is the caller graph for this function:

void gffOutput ( struct gffLine el,
FILE *  f,
char  sep,
char  lastSep 
)

Definition at line 431 of file gff.c.

References gffLine::end, gffLine::exonId, gffLine::feature, gffLine::frame, gffLine::geneId, gffLine::group, gffLine::score, gffLine::seq, gffLine::source, gffLine::start, and gffLine::strand.

00433 {
00434 if (sep == ',') fputc('"',f);
00435 fprintf(f, "%s", el->seq);
00436 if (sep == ',') fputc('"',f);
00437 fputc(sep,f);
00438 if (sep == ',') fputc('"',f);
00439 fprintf(f, "%s", el->source);
00440 if (sep == ',') fputc('"',f);
00441 fputc(sep,f);
00442 if (sep == ',') fputc('"',f);
00443 fprintf(f, "%s", el->feature);
00444 if (sep == ',') fputc('"',f);
00445 fputc(sep,f);
00446 fprintf(f, "%u", el->start+1);
00447 fputc(sep,f);
00448 fprintf(f, "%u", el->end);
00449 fputc(sep,f);
00450 fprintf(f, "%f", el->score);
00451 fputc(sep,f);
00452 if (sep == ',') fputc('"',f);
00453 fprintf(f, "%c", el->strand);
00454 if (sep == ',') fputc('"',f);
00455 fputc(sep,f);
00456 if (sep == ',') fputc('"',f);
00457 fprintf(f, "%c", el->frame);
00458 if (sep == ',') fputc('"',f);
00459 fputc(sep,f);
00460 if (sep == ',') fputc('"',f);
00461 if (el->geneId != NULL)
00462     fprintf(f, "gene_id %s\"%s%s\"; ",
00463             (sep == ',') ? "\\" : "",
00464             el->geneId,
00465             (sep == ',') ? "\\" : "");
00466 fprintf(f, "transcript_id %s\"%s%s\"; ",
00467         (sep == ',') ? "\\" : "",
00468         el->group,
00469         (sep == ',') ? "\\" : "");
00470 if (el->exonId != NULL)
00471     fprintf(f, "exon_id %s\"%s%s\"; ",
00472             (sep == ',') ? "\\" : "",
00473             el->exonId,
00474             (sep == ',') ? "\\" : "");
00475 if (sep == ',') fputc('"',f);
00476 fputc(lastSep,f);
00477 }

struct gffFile* gffRead ( char *  fileName  )  [read]

Definition at line 367 of file gff.c.

References gffFileAdd(), and gffFileNew().

00369 {
00370 struct gffFile *gff = gffFileNew(fileName);
00371 gffFileAdd(gff, fileName, 0);
00372 return gff;
00373 }

Here is the call graph for this function:

static void gffSyntaxError ( char *  fileName,
int  line,
char *  msg 
) [static]

Definition at line 82 of file gff.c.

References errAbort().

Referenced by gffFileAddRow().

00084 {
00085 errAbort("%s Bad line %d of %s:\n", msg, line, fileName);
00086 }

Here is the call graph for this function:

Here is the caller graph for this function:

static char* gffTnName ( char *  seqName,
char *  groupName 
) [static]

Definition at line 88 of file gff.c.

References startsWith().

Referenced by gffFileAddRow().

00090 {
00091 static char nameBuf[512];
00092 if (startsWith("gene-", groupName))
00093     groupName += 5;
00094 if (startsWith("cc_", groupName))
00095     groupName += 3;
00096 strcpy(nameBuf, groupName);
00097 return nameBuf;
00098 }

Here is the call graph for this function:

Here is the caller graph for this function:

static boolean isGtfGroup ( char *  group  )  [static]

Definition at line 100 of file gff.c.

References countChars(), FALSE, and TRUE.

Referenced by gffFileAddRow(), and gffHasGtfGroup().

00102 {
00103 if (strstr(group, "gene_id") == NULL)
00104     return FALSE;
00105 if (countChars(group, '"') >= 2)
00106     return TRUE;
00107 if (strstr(group, "transcript_id") != NULL)
00108     return TRUE;
00109 return FALSE;
00110 }

Here is the call graph for this function:

Here is the caller graph for this function:

static void parseGtfEnd ( char *  s,
struct gffFile gff,
struct gffLine gl,
char *  fileName,
int  lineIx 
) [static]

Definition at line 133 of file gff.c.

References AllocVar, cloneString(), errAbort(), gffFile::exonHash, gffLine::exonId, gffLine::exonNumber, FALSE, gffLine::geneId, gffFile::geneIdHash, gffFile::geneIdList, gffLine::group, gffFile::groupHash, gffFile::groupList, hashAdd(), hashLookup(), gffLine::intronId, gffLine::intronStatus, gffFile::intronStatusHash, gffGroup::name, gffGeneId::name, hashEl::name, nextWord(), gffLine::proteinId, gffFile::proteinIdHash, readQuotedString(), sameString, gffLine::seq, skipLeadingSpaces(), slAddHead, gffLine::source, TRUE, hashEl::val, and warn().

Referenced by gffFileAddRow().

00137 {
00138 char *type, *val;
00139 struct hashEl *hel;
00140 bool gotSemi;
00141 
00142 for (;;)
00143    {
00144    gotSemi = FALSE;
00145    if ((type = nextWord(&s)) == NULL)
00146        break;
00147    s = skipLeadingSpaces(s);
00148    if (NULL == s || s[0] == 0)
00149        errAbort("Unpaired type/val on end of gtf line %d of %s", lineIx, fileName);
00150    if (s[0] == '"' || s[0] == '\'')
00151        {
00152        val = s;
00153        readQuotedString(fileName, lineIx, s, val, &s);
00154        }
00155    else
00156        {
00157        int len;
00158        val = nextWord(&s);
00159        len = strlen(val) - 1;
00160        if (val[len] == ';')
00161            {
00162            val[len] = 0;
00163            len -= 1;
00164            gotSemi = TRUE;
00165            }
00166        if (len < 0)
00167            errAbort("Empty value for %s line %d of %s", type, lineIx, fileName);
00168        }
00169    if (s != NULL && !gotSemi)
00170       {
00171       s = strchr(s, ';');
00172       if (s != NULL)
00173          ++s;
00174       }
00175    /* only use the first occurance of gene_id and transcript_id */
00176    if (sameString("gene_id", type) && (gl->geneId == NULL))
00177        {
00178        struct gffGeneId *gg;
00179        if ((hel = hashLookup(gff->geneIdHash, val)) == NULL)
00180            {
00181            AllocVar(gg);
00182            hel = hashAdd(gff->geneIdHash, val, gg);
00183            gg->name = hel->name;
00184            slAddHead(&gff->geneIdList, gg);
00185            }
00186         else
00187            {
00188            gg = hel->val;
00189            }
00190        gl->geneId = gg->name;
00191        }
00192    else if (sameString("transcript_id", type) && (gl->group == NULL))
00193        {
00194        struct gffGroup *gg;
00195        if ((hel = hashLookup(gff->groupHash, val)) == NULL)
00196            {
00197            AllocVar(gg);
00198            hel = hashAdd(gff->groupHash, val, gg);
00199            gg->name = hel->name;
00200            gg->seq = gl->seq;
00201            gg->source = gl->source;
00202            slAddHead(&gff->groupList, gg);
00203            }
00204         else
00205            {
00206            gg = hel->val;
00207            }
00208        gl->group = gg->name;
00209        }
00210    else if (sameString("exon_id", type))
00211        {
00212        if ((hel = hashLookup(gff->exonHash, val)) == NULL)
00213            hel = hashAdd(gff->exonHash, val, NULL);
00214        gl->exonId = hel->val;
00215        }
00216    else if (sameString("exon_number", type))
00217        {
00218        if (!isdigit(val[0]))
00219            errAbort("Expecting number after exon_number, got %s line %d of %s", val, lineIx, fileName);
00220        gl->exonNumber = atoi(val);
00221        }
00222    else if (sameString("intron_id", type))
00223        gl->intronId = cloneString(val);
00224    else if (sameString("intron_status", type))
00225        {
00226        if ((hel = hashLookup(gff->intronStatusHash, val)) == NULL)
00227            hel = hashAdd(gff->intronStatusHash, val, NULL);
00228        gl->intronStatus = hel->name;
00229        }
00230    else if (sameString("protein_id", type))
00231        {
00232        if ((hel = hashLookup(gff->proteinIdHash, val)) == NULL)
00233            hel = hashAdd(gff->proteinIdHash, val, NULL);
00234        gl->proteinId = hel->name;
00235        }
00236    }
00237 if (gl->group == NULL)
00238     {
00239     if (gl->geneId == NULL)
00240         warn("No gene_id or transcript_id line %d of %s", lineIx, fileName);
00241     }
00242 }

Here is the call graph for this function:

Here is the caller graph for this function:

static void readQuotedString ( char *  fileName,
int  lineIx,
char *  in,
char *  out,
char **  retNext 
) [static]

Definition at line 126 of file gff.c.

References errAbort(), and parseQuotedString().

Referenced by parseGtfEnd().

00128 {
00129 if (!parseQuotedString(in, out, retNext))
00130     errAbort("Line %d of %s\n", lineIx, fileName);
00131 }

Here is the call graph for this function:

Here is the caller graph for this function:


Variable Documentation

char const rcsid[] = "$Id: gff.c,v 1.21 2007/02/01 00:43:16 kate Exp $" [static]

Definition at line 13 of file gff.c.


Generated on Tue Dec 25 19:51:56 2007 for blat by  doxygen 1.5.2