inc/gff.h File Reference

This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  gffLine
struct  gffGroup
struct  gffSource
struct  gffFeature
struct  gffSeqName
struct  gffGeneId
struct  gffFile

Defines

#define gffTabOut(el, f)   gffOutput(el,f,'\t','\n');
#define gffCommaOut(el, f)   gffOutput(el,f,',',',');

Functions

void gffGroupFree (struct gffGroup **pGroup)
void gffGroupFreeList (struct gffGroup **pList)
void gffGroupLines (struct gffFile *gff)
struct gffFile * gffRead (char *fileName)
struct gffFile * gffFileNew (char *fileName)
void gffFileAdd (struct gffFile *gff, char *fileName, int baseOffset)
void gffFileAddRow (struct gffFile *gff, int baseOffset, char *words[], int wordCount, char *fileName, int lineIx)
void gffFileFree (struct gffFile **pGff)
int gffLineCmp (const void *va, const void *vb)
void gffOutput (struct gffLine *el, FILE *f, char sep, char lastSep)
boolean gffHasGtfGroup (char *line)


Define Documentation

#define gffCommaOut ( el,
 f )     gffOutput(el,f,',',',');

Definition at line 136 of file gff.h.

#define gffTabOut ( el,
 f )     gffOutput(el,f,'\t','\n');

Definition at line 133 of file gff.h.


Function Documentation

void gffFileAdd ( struct gffFile *  gff,
char *  fileName,
int  baseOffset 
)

Definition at line 324 of file gff.c.

References chopTabs, gffFile::featureList, lineFile::fileName, gffFile::geneIdList, gffFileAddRow(), gffFile::groupList, lineFileClose(), lineFileNext(), lineFileOpen(), lineFile::lineIx, gffFile::lineList, gffFile::seqList, slReverse(), gffFile::sourceList, and TRUE.

Referenced by gffRead().

00326 {
00327 /* Open file and do basic allocations. */
00328 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00329 char *line, *words[9];
00330 int lineSize, wordCount;
00331 
00332 while (lineFileNext(lf, &line, &lineSize))
00333     {
00334     if (line[0] != '#')
00335         {
00336         wordCount = chopTabs(line, words);
00337         if (wordCount > 0)
00338             gffFileAddRow(gff, baseOffset, words, wordCount, lf->fileName, lf->lineIx);
00339         }
00340     }
00341 slReverse(&gff->lineList);
00342 slReverse(&gff->seqList);
00343 slReverse(&gff->sourceList);
00344 slReverse(&gff->featureList);
00345 slReverse(&gff->groupList);
00346 slReverse(&gff->geneIdList);
00347 lineFileClose(&lf);
00348 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gffFileAddRow ( struct gffFile *  gff,
int  baseOffset,
char *  words[],
int  wordCount,
char *  fileName,
int  lineIx 
)

Definition at line 244 of file gff.c.

References AllocVar, gffLine::end, gffLine::feature, gffFile::featureHash, gffFile::featureList, gffLine::frame, gffSyntaxError(), gffTnName(), gffLine::group, gffFile::groupHash, gffFile::groupList, hashAdd(), hashLookup(), gffFile::isGtf, isGtfGroup(), gffFile::lineList, gffSeqName::name, hashEl::name, gffSource::name, gffFeature::name, gffGroup::name, parseGtfEnd(), gffLine::score, gffLine::seq, gffGroup::seq, gffFile::seqHash, gffFile::seqList, slAddHead, gffLine::source, gffGroup::source, gffFile::sourceHash, gffFile::sourceList, gffLine::start, gffLine::strand, TRUE, and gffFile::typeKnown.

Referenced by gffFileAdd().

00247 {
00248 struct hashEl *hel;
00249 struct gffLine *gl;
00250 
00251 if (wordCount < 8)
00252     gffSyntaxError(fileName, lineIx, "Word count less than 8 ");
00253 AllocVar(gl);
00254 
00255 if ((hel = hashLookup(gff->seqHash, words[0])) == NULL)
00256     {
00257     struct gffSeqName *el;
00258     AllocVar(el);
00259     hel = hashAdd(gff->seqHash, words[0], el);
00260     el->name = hel->name;
00261     slAddHead(&gff->seqList, el);
00262     }
00263 gl->seq = hel->name;
00264 
00265 if ((hel = hashLookup(gff->sourceHash, words[1])) == NULL)
00266     {
00267     struct gffSource *el;
00268     AllocVar(el);
00269     hel = hashAdd(gff->sourceHash, words[1], el);
00270     el->name = hel->name;
00271     slAddHead(&gff->sourceList, el);
00272     }
00273 gl->source = hel->name;
00274 
00275 if ((hel = hashLookup(gff->featureHash, words[2])) == NULL)
00276     {
00277     struct gffFeature *el;
00278     AllocVar(el);
00279     hel = hashAdd(gff->featureHash, words[2], el);
00280     el->name = hel->name;
00281     slAddHead(&gff->featureList, el);
00282     }
00283 gl->feature = hel->name;
00284 
00285 if (!isdigit(words[3][0]) || !isdigit(words[4][0]))
00286    gffSyntaxError(fileName, lineIx, "col 3 or 4 not a number ");        
00287 gl->start = atoi(words[3])-1 + baseOffset;
00288 gl->end = atoi(words[4]) + baseOffset;
00289 gl->score = atof(words[5]);
00290 gl->strand = words[6][0];
00291 gl->frame = words[7][0];
00292 
00293 if (wordCount >= 9)
00294     {
00295     if (!gff->typeKnown)
00296         {
00297         gff->typeKnown = TRUE;
00298         gff->isGtf = isGtfGroup(words[8]);
00299         }
00300     if (gff->isGtf)
00301         {
00302         parseGtfEnd(words[8], gff, gl, fileName, lineIx);
00303         }
00304     else
00305         {
00306         char *tnName = gffTnName(gl->seq, words[8]);
00307         if ((hel = hashLookup(gff->groupHash, tnName)) == NULL)
00308             {
00309             struct gffGroup *group;
00310             AllocVar(group);
00311             hel = hashAdd(gff->groupHash, tnName, group);
00312             group->name = hel->name;
00313             group->seq = gl->seq;
00314             group->source = gl->source;
00315             slAddHead(&gff->groupList, group);
00316             }
00317         gl->group = hel->name;
00318         }
00319     }
00320 slAddHead(&gff->lineList, gl);
00321 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gffFileFree ( struct gffFile **  pGff  ) 

Definition at line 39 of file gff.c.

References gffFile::exonHash, gffFile::featureHash, gffFile::featureList, gffFile::fileName, freeHash(), freeMem(), freez(), gffFile::geneIdHash, gffFile::geneIdList, gffGroupFreeList(), gffFile::groupHash, gffFile::groupList, gffFile::intronStatusHash, gffFile::lineList, gffFile::proteinIdHash, gffFile::seqHash, gffFile::seqList, slFreeList(), gffFile::sourceHash, and gffFile::sourceList.

00041 {
00042 struct gffFile *gff;
00043 if ((gff = *pGff) != NULL)
00044     {
00045     freeMem(gff->fileName);
00046     freeHash(&gff->seqHash);
00047     freeHash(&gff->sourceHash);
00048     freeHash(&gff->featureHash);
00049     freeHash(&gff->groupHash);
00050     freeHash(&gff->geneIdHash);
00051     freeHash(&gff->exonHash);
00052     freeHash(&gff->intronStatusHash);
00053     freeHash(&gff->proteinIdHash);
00054     slFreeList(&gff->lineList);
00055     slFreeList(&gff->seqList);
00056     slFreeList(&gff->sourceList);
00057     slFreeList(&gff->featureList);
00058     slFreeList(&gff->geneIdList);
00059     gffGroupFreeList(&gff->groupList);
00060     freez(pGff);
00061     }
00062 }

Here is the call graph for this function:

struct gffFile* gffFileNew ( char *  fileName  )  [read]

Definition at line 350 of file gff.c.

References AllocVar, cloneString(), gffFile::exonHash, gffFile::featureHash, gffFile::fileName, gffFile::geneIdHash, gffFile::groupHash, gffFile::intronStatusHash, newHash(), gffFile::proteinIdHash, gffFile::seqHash, and gffFile::sourceHash.

Referenced by gffRead().

00352 {
00353 struct gffFile *gff;
00354 AllocVar(gff);
00355 gff->fileName = cloneString(fileName);
00356 gff->seqHash = newHash(6);
00357 gff->sourceHash = newHash(6);
00358 gff->featureHash = newHash(6);
00359 gff->groupHash = newHash(12);
00360 gff->geneIdHash = newHash(12);
00361 gff->exonHash = newHash(16);
00362 gff->intronStatusHash = newHash(4);
00363 gff->proteinIdHash = newHash(12);
00364 return gff;
00365 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gffGroupFree ( struct gffGroup **  pGroup  ) 

Definition at line 15 of file gff.c.

References freez(), gffGroup::lineList, and slFreeList().

Referenced by gffGroupFreeList().

00017 {
00018 struct gffGroup *group;
00019 if ((group = *pGroup) != NULL)
00020     {
00021     slFreeList(&group->lineList);
00022     freez(pGroup);
00023     }
00024 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gffGroupFreeList ( struct gffGroup **  pList  ) 

Definition at line 26 of file gff.c.

References gffGroupFree(), and gffGroup::next.

Referenced by gffFileFree().

00028 {
00029 struct gffGroup *el, *next;
00030 for (el = *pList; el != NULL; el = next)
00031     {
00032     next = el->next;
00033     gffGroupFree(&el);
00034     }
00035 *pList = NULL;
00036 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gffGroupLines ( struct gffFile *  gff  ) 

Definition at line 394 of file gff.c.

References getGroupBoundaries(), gffLineCmp(), gffLine::group, gffFile::groupHash, gffFile::groupList, hashLookup(), gffFile::lineList, gffGroup::lineList, gffLine::next, gffGroup::next, nextLine(), slAddHead, slReverse(), slSort(), and hashEl::val.

00397 {
00398 struct gffLine *line, *nextLine;
00399 struct hash *groupHash = gff->groupHash;
00400 char *groupName;
00401 struct gffGroup *group;
00402 struct gffLine *ungroupedLines = NULL;
00403 
00404 for (line = gff->lineList; line != NULL; line = nextLine)
00405     {
00406     nextLine = line->next;
00407     if ((groupName = line->group) != NULL)
00408         {
00409         struct hashEl *hel = hashLookup(groupHash, groupName);
00410         group = hel->val;
00411         slAddHead(&group->lineList, line);
00412         }
00413     else
00414         {
00415         slAddHead(&ungroupedLines, line);
00416         }
00417     }
00418 
00419 /* Restore ungrouped lines to gff->lineList. */
00420 slReverse(&ungroupedLines);
00421 gff->lineList = ungroupedLines;
00422 
00423 /* Restore order of grouped lines and fill in start and end. */
00424 for (group = gff->groupList; group != NULL; group = group->next)
00425     {
00426     slSort(&group->lineList, gffLineCmp);
00427     getGroupBoundaries(group);
00428     }
00429 }

Here is the call graph for this function:

boolean gffHasGtfGroup ( char *  line  ) 

Definition at line 112 of file gff.c.

References chopTabs, cloneString(), FALSE, freeMem(), isGtfGroup(), and TRUE.

00114 {
00115 char *words[10];
00116 char *dupe = cloneString(line);
00117 int wordCt = chopTabs(dupe, words);
00118 boolean isGtf = FALSE;
00119 if (wordCt >= 9) 
00120     if (isGtfGroup(words[8]))
00121         isGtf = TRUE;
00122 freeMem(dupe);
00123 return isGtf;
00124 }

Here is the call graph for this function:

int gffLineCmp ( const void *  va,
const void *  vb 
)

Definition at line 64 of file gff.c.

References gffLine::end, gffLine::seq, and gffLine::start.

Referenced by gffGroupLines().

00066 {
00067 const struct gffLine *a = *((struct gffLine **)va);
00068 const struct gffLine *b = *((struct gffLine **)vb);
00069 int diff;
00070 
00071 /* for overlaping starts, sort by end, genePredFromGroupedGtf() depends on
00072  * this */
00073 diff = strcmp(a->seq, b->seq);
00074 if (diff == 0)
00075     diff = a->start - b->start;
00076 if (diff == 0)
00077     diff = a->end - b->end;
00078 return diff;
00079 }

Here is the caller graph for this function:

void gffOutput ( struct gffLine *  el,
FILE *  f,
char  sep,
char  lastSep 
)

Definition at line 431 of file gff.c.

References gffLine::end, gffLine::exonId, gffLine::feature, gffLine::frame, gffLine::geneId, gffLine::group, gffLine::score, gffLine::seq, gffLine::source, gffLine::start, and gffLine::strand.

00433 {
00434 if (sep == ',') fputc('"',f);
00435 fprintf(f, "%s", el->seq);
00436 if (sep == ',') fputc('"',f);
00437 fputc(sep,f);
00438 if (sep == ',') fputc('"',f);
00439 fprintf(f, "%s", el->source);
00440 if (sep == ',') fputc('"',f);
00441 fputc(sep,f);
00442 if (sep == ',') fputc('"',f);
00443 fprintf(f, "%s", el->feature);
00444 if (sep == ',') fputc('"',f);
00445 fputc(sep,f);
00446 fprintf(f, "%u", el->start+1);
00447 fputc(sep,f);
00448 fprintf(f, "%u", el->end);
00449 fputc(sep,f);
00450 fprintf(f, "%f", el->score);
00451 fputc(sep,f);
00452 if (sep == ',') fputc('"',f);
00453 fprintf(f, "%c", el->strand);
00454 if (sep == ',') fputc('"',f);
00455 fputc(sep,f);
00456 if (sep == ',') fputc('"',f);
00457 fprintf(f, "%c", el->frame);
00458 if (sep == ',') fputc('"',f);
00459 fputc(sep,f);
00460 if (sep == ',') fputc('"',f);
00461 if (el->geneId != NULL)
00462     fprintf(f, "gene_id %s\"%s%s\"; ",
00463             (sep == ',') ? "\\" : "",
00464             el->geneId,
00465             (sep == ',') ? "\\" : "");
00466 fprintf(f, "transcript_id %s\"%s%s\"; ",
00467         (sep == ',') ? "\\" : "",
00468         el->group,
00469         (sep == ',') ? "\\" : "");
00470 if (el->exonId != NULL)
00471     fprintf(f, "exon_id %s\"%s%s\"; ",
00472             (sep == ',') ? "\\" : "",
00473             el->exonId,
00474             (sep == ',') ? "\\" : "");
00475 if (sep == ',') fputc('"',f);
00476 fputc(lastSep,f);
00477 }

struct gffFile* gffRead ( char *  fileName  )  [read]

Definition at line 367 of file gff.c.

References gffFileAdd(), and gffFileNew().

00369 {
00370 struct gffFile *gff = gffFileNew(fileName);
00371 gffFileAdd(gff, fileName, 0);
00372 return gff;
00373 }

Here is the call graph for this function:


Generated on Tue Dec 25 18:58:45 2007 for blat by  doxygen 1.5.2