#include "common.h"
#include "hash.h"
#include "linefile.h"
#include "gff.h"
#include "obscure.h"

Include dependency graph for gff.c:

Go to the source code of this file.
Functions | |
| void | gffGroupFree (struct gffGroup **pGroup) |
| void | gffGroupFreeList (struct gffGroup **pList) |
| void | gffFileFree (struct gffFile **pGff) |
| int | gffLineCmp (const void *va, const void *vb) |
| static void | gffSyntaxError (char *fileName, int line, char *msg) |
| static char * | gffTnName (char *seqName, char *groupName) |
| static boolean | isGtfGroup (char *group) |
| boolean | gffHasGtfGroup (char *line) |
| static void | readQuotedString (char *fileName, int lineIx, char *in, char *out, char **retNext) |
| static void | parseGtfEnd (char *s, struct gffFile *gff, struct gffLine *gl, char *fileName, int lineIx) |
| void | gffFileAddRow (struct gffFile *gff, int baseOffset, char *words[], int wordCount, char *fileName, int lineIx) |
| void | gffFileAdd (struct gffFile *gff, char *fileName, int baseOffset) |
| gffFile * | gffFileNew (char *fileName) |
| gffFile * | gffRead (char *fileName) |
| static void | getGroupBoundaries (struct gffGroup *group) |
| void | gffGroupLines (struct gffFile *gff) |
| void | gffOutput (struct gffLine *el, FILE *f, char sep, char lastSep) |
Variables | |
| static char const | rcsid [] = "$Id: gff.c,v 1.21 2007/02/01 00:43:16 kate Exp $" |
| static void getGroupBoundaries | ( | struct gffGroup * | group | ) | [static] |
Definition at line 375 of file gff.c.
References gffLine::end, gffLine::group, gffLine::next, gffLine::start, and gffLine::strand.
Referenced by gffGroupLines().
00377 { 00378 struct gffLine *line; 00379 int start = 0x3fffffff; 00380 int end = -start; 00381 line = group->lineList; 00382 group->strand = line->strand; 00383 for (; line != NULL; line = line->next) 00384 { 00385 if (start > line->start) 00386 start = line->start; 00387 if (end < line->end) 00388 end = line->end; 00389 } 00390 group->start = start; 00391 group->end = end; 00392 }
Here is the caller graph for this function:

| void gffFileAdd | ( | struct gffFile * | gff, | |
| char * | fileName, | |||
| int | baseOffset | |||
| ) |
Definition at line 324 of file gff.c.
References chopTabs, gffFile::featureList, lineFile::fileName, gffFile::geneIdList, gffFileAddRow(), gffFile::groupList, lineFileClose(), lineFileNext(), lineFileOpen(), lineFile::lineIx, gffFile::lineList, gffFile::seqList, slReverse(), gffFile::sourceList, and TRUE.
Referenced by gffRead().
00326 { 00327 /* Open file and do basic allocations. */ 00328 struct lineFile *lf = lineFileOpen(fileName, TRUE); 00329 char *line, *words[9]; 00330 int lineSize, wordCount; 00331 00332 while (lineFileNext(lf, &line, &lineSize)) 00333 { 00334 if (line[0] != '#') 00335 { 00336 wordCount = chopTabs(line, words); 00337 if (wordCount > 0) 00338 gffFileAddRow(gff, baseOffset, words, wordCount, lf->fileName, lf->lineIx); 00339 } 00340 } 00341 slReverse(&gff->lineList); 00342 slReverse(&gff->seqList); 00343 slReverse(&gff->sourceList); 00344 slReverse(&gff->featureList); 00345 slReverse(&gff->groupList); 00346 slReverse(&gff->geneIdList); 00347 lineFileClose(&lf); 00348 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void gffFileAddRow | ( | struct gffFile * | gff, | |
| int | baseOffset, | |||
| char * | words[], | |||
| int | wordCount, | |||
| char * | fileName, | |||
| int | lineIx | |||
| ) |
Definition at line 244 of file gff.c.
References AllocVar, gffLine::end, gffLine::feature, gffFile::featureHash, gffFile::featureList, gffLine::frame, gffSyntaxError(), gffTnName(), gffLine::group, gffFile::groupHash, gffFile::groupList, hashAdd(), hashLookup(), gffFile::isGtf, isGtfGroup(), gffFile::lineList, gffGroup::name, gffFeature::name, gffSource::name, hashEl::name, gffSeqName::name, parseGtfEnd(), gffLine::score, gffGroup::seq, gffLine::seq, gffFile::seqHash, gffFile::seqList, slAddHead, gffGroup::source, gffLine::source, gffFile::sourceHash, gffFile::sourceList, gffLine::start, gffLine::strand, TRUE, and gffFile::typeKnown.
Referenced by gffFileAdd().
00247 { 00248 struct hashEl *hel; 00249 struct gffLine *gl; 00250 00251 if (wordCount < 8) 00252 gffSyntaxError(fileName, lineIx, "Word count less than 8 "); 00253 AllocVar(gl); 00254 00255 if ((hel = hashLookup(gff->seqHash, words[0])) == NULL) 00256 { 00257 struct gffSeqName *el; 00258 AllocVar(el); 00259 hel = hashAdd(gff->seqHash, words[0], el); 00260 el->name = hel->name; 00261 slAddHead(&gff->seqList, el); 00262 } 00263 gl->seq = hel->name; 00264 00265 if ((hel = hashLookup(gff->sourceHash, words[1])) == NULL) 00266 { 00267 struct gffSource *el; 00268 AllocVar(el); 00269 hel = hashAdd(gff->sourceHash, words[1], el); 00270 el->name = hel->name; 00271 slAddHead(&gff->sourceList, el); 00272 } 00273 gl->source = hel->name; 00274 00275 if ((hel = hashLookup(gff->featureHash, words[2])) == NULL) 00276 { 00277 struct gffFeature *el; 00278 AllocVar(el); 00279 hel = hashAdd(gff->featureHash, words[2], el); 00280 el->name = hel->name; 00281 slAddHead(&gff->featureList, el); 00282 } 00283 gl->feature = hel->name; 00284 00285 if (!isdigit(words[3][0]) || !isdigit(words[4][0])) 00286 gffSyntaxError(fileName, lineIx, "col 3 or 4 not a number "); 00287 gl->start = atoi(words[3])-1 + baseOffset; 00288 gl->end = atoi(words[4]) + baseOffset; 00289 gl->score = atof(words[5]); 00290 gl->strand = words[6][0]; 00291 gl->frame = words[7][0]; 00292 00293 if (wordCount >= 9) 00294 { 00295 if (!gff->typeKnown) 00296 { 00297 gff->typeKnown = TRUE; 00298 gff->isGtf = isGtfGroup(words[8]); 00299 } 00300 if (gff->isGtf) 00301 { 00302 parseGtfEnd(words[8], gff, gl, fileName, lineIx); 00303 } 00304 else 00305 { 00306 char *tnName = gffTnName(gl->seq, words[8]); 00307 if ((hel = hashLookup(gff->groupHash, tnName)) == NULL) 00308 { 00309 struct gffGroup *group; 00310 AllocVar(group); 00311 hel = hashAdd(gff->groupHash, tnName, group); 00312 group->name = hel->name; 00313 group->seq = gl->seq; 00314 group->source = gl->source; 00315 slAddHead(&gff->groupList, group); 00316 } 00317 gl->group = 
hel->name; 00318 } 00319 } 00320 slAddHead(&gff->lineList, gl); 00321 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void gffFileFree | ( | struct gffFile ** | pGff | ) |
Definition at line 39 of file gff.c.
References gffFile::exonHash, gffFile::featureHash, gffFile::featureList, gffFile::fileName, freeHash(), freeMem(), freez(), gffFile::geneIdHash, gffFile::geneIdList, gffGroupFreeList(), gffFile::groupHash, gffFile::groupList, gffFile::intronStatusHash, gffFile::lineList, gffFile::proteinIdHash, gffFile::seqHash, gffFile::seqList, slFreeList(), gffFile::sourceHash, and gffFile::sourceList.
00041 { 00042 struct gffFile *gff; 00043 if ((gff = *pGff) != NULL) 00044 { 00045 freeMem(gff->fileName); 00046 freeHash(&gff->seqHash); 00047 freeHash(&gff->sourceHash); 00048 freeHash(&gff->featureHash); 00049 freeHash(&gff->groupHash); 00050 freeHash(&gff->geneIdHash); 00051 freeHash(&gff->exonHash); 00052 freeHash(&gff->intronStatusHash); 00053 freeHash(&gff->proteinIdHash); 00054 slFreeList(&gff->lineList); 00055 slFreeList(&gff->seqList); 00056 slFreeList(&gff->sourceList); 00057 slFreeList(&gff->featureList); 00058 slFreeList(&gff->geneIdList); 00059 gffGroupFreeList(&gff->groupList); 00060 freez(pGff); 00061 } 00062 }
Here is the call graph for this function:

| struct gffFile* gffFileNew | ( | char * | fileName | ) | [read] |
Definition at line 350 of file gff.c.
References AllocVar, cloneString(), gffFile::exonHash, gffFile::featureHash, gffFile::fileName, gffFile::geneIdHash, gffFile::groupHash, gffFile::intronStatusHash, newHash(), gffFile::proteinIdHash, gffFile::seqHash, and gffFile::sourceHash.
Referenced by gffRead().
00352 { 00353 struct gffFile *gff; 00354 AllocVar(gff); 00355 gff->fileName = cloneString(fileName); 00356 gff->seqHash = newHash(6); 00357 gff->sourceHash = newHash(6); 00358 gff->featureHash = newHash(6); 00359 gff->groupHash = newHash(12); 00360 gff->geneIdHash = newHash(12); 00361 gff->exonHash = newHash(16); 00362 gff->intronStatusHash = newHash(4); 00363 gff->proteinIdHash = newHash(12); 00364 return gff; 00365 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void gffGroupFree | ( | struct gffGroup ** | pGroup | ) |
Definition at line 15 of file gff.c.
References freez(), gffGroup::lineList, and slFreeList().
Referenced by gffGroupFreeList().
00017 { 00018 struct gffGroup *group; 00019 if ((group = *pGroup) != NULL) 00020 { 00021 slFreeList(&group->lineList); 00022 freez(pGroup); 00023 } 00024 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void gffGroupFreeList | ( | struct gffGroup ** | pList | ) |
Definition at line 26 of file gff.c.
References gffGroupFree(), and gffGroup::next.
Referenced by gffFileFree().
00028 { 00029 struct gffGroup *el, *next; 00030 for (el = *pList; el != NULL; el = next) 00031 { 00032 next = el->next; 00033 gffGroupFree(&el); 00034 } 00035 *pList = NULL; 00036 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void gffGroupLines | ( | struct gffFile * | gff | ) |
Definition at line 394 of file gff.c.
References getGroupBoundaries(), gffLineCmp(), gffLine::group, gffFile::groupHash, gffFile::groupList, hashLookup(), gffGroup::lineList, gffFile::lineList, gffGroup::next, gffLine::next, nextLine(), slAddHead, slReverse(), slSort(), and hashEl::val.
00397 { 00398 struct gffLine *line, *nextLine; 00399 struct hash *groupHash = gff->groupHash; 00400 char *groupName; 00401 struct gffGroup *group; 00402 struct gffLine *ungroupedLines = NULL; 00403 00404 for (line = gff->lineList; line != NULL; line = nextLine) 00405 { 00406 nextLine = line->next; 00407 if ((groupName = line->group) != NULL) 00408 { 00409 struct hashEl *hel = hashLookup(groupHash, groupName); 00410 group = hel->val; 00411 slAddHead(&group->lineList, line); 00412 } 00413 else 00414 { 00415 slAddHead(&ungroupedLines, line); 00416 } 00417 } 00418 00419 /* Restore ungrouped lines to gff->lineList. */ 00420 slReverse(&ungroupedLines); 00421 gff->lineList = ungroupedLines; 00422 00423 /* Restore order of grouped lines and fill in start and end. */ 00424 for (group = gff->groupList; group != NULL; group = group->next) 00425 { 00426 slSort(&group->lineList, gffLineCmp); 00427 getGroupBoundaries(group); 00428 } 00429 }
Here is the call graph for this function:

| boolean gffHasGtfGroup | ( | char * | line | ) |
Definition at line 112 of file gff.c.
References chopTabs, cloneString(), FALSE, freeMem(), isGtfGroup(), and TRUE.
00114 { 00115 char *words[10]; 00116 char *dupe = cloneString(line); 00117 int wordCt = chopTabs(dupe, words); 00118 boolean isGtf = FALSE; 00119 if (wordCt >= 9) 00120 if (isGtfGroup(words[8])) 00121 isGtf = TRUE; 00122 freeMem(dupe); 00123 return isGtf; 00124 }
Here is the call graph for this function:

| int gffLineCmp | ( | const void * | va, | |
| const void * | vb | |||
| ) |
Definition at line 64 of file gff.c.
References gffLine::end, gffLine::seq, and gffLine::start.
Referenced by gffGroupLines().
00066 { 00067 const struct gffLine *a = *((struct gffLine **)va); 00068 const struct gffLine *b = *((struct gffLine **)vb); 00069 int diff; 00070 00071 /* for overlaping starts, sort by end, genePredFromGroupedGtf() depends on 00072 * this */ 00073 diff = strcmp(a->seq, b->seq); 00074 if (diff == 0) 00075 diff = a->start - b->start; 00076 if (diff == 0) 00077 diff = a->end - b->end; 00078 return diff; 00079 }
Here is the caller graph for this function:

| void gffOutput | ( | struct gffLine * | el, | |
| FILE * | f, | |||
| char | sep, | |||
| char | lastSep | |||
| ) |
Definition at line 431 of file gff.c.
References gffLine::end, gffLine::exonId, gffLine::feature, gffLine::frame, gffLine::geneId, gffLine::group, gffLine::score, gffLine::seq, gffLine::source, gffLine::start, and gffLine::strand.
00433 { 00434 if (sep == ',') fputc('"',f); 00435 fprintf(f, "%s", el->seq); 00436 if (sep == ',') fputc('"',f); 00437 fputc(sep,f); 00438 if (sep == ',') fputc('"',f); 00439 fprintf(f, "%s", el->source); 00440 if (sep == ',') fputc('"',f); 00441 fputc(sep,f); 00442 if (sep == ',') fputc('"',f); 00443 fprintf(f, "%s", el->feature); 00444 if (sep == ',') fputc('"',f); 00445 fputc(sep,f); 00446 fprintf(f, "%u", el->start+1); 00447 fputc(sep,f); 00448 fprintf(f, "%u", el->end); 00449 fputc(sep,f); 00450 fprintf(f, "%f", el->score); 00451 fputc(sep,f); 00452 if (sep == ',') fputc('"',f); 00453 fprintf(f, "%c", el->strand); 00454 if (sep == ',') fputc('"',f); 00455 fputc(sep,f); 00456 if (sep == ',') fputc('"',f); 00457 fprintf(f, "%c", el->frame); 00458 if (sep == ',') fputc('"',f); 00459 fputc(sep,f); 00460 if (sep == ',') fputc('"',f); 00461 if (el->geneId != NULL) 00462 fprintf(f, "gene_id %s\"%s%s\"; ", 00463 (sep == ',') ? "\\" : "", 00464 el->geneId, 00465 (sep == ',') ? "\\" : ""); 00466 fprintf(f, "transcript_id %s\"%s%s\"; ", 00467 (sep == ',') ? "\\" : "", 00468 el->group, 00469 (sep == ',') ? "\\" : ""); 00470 if (el->exonId != NULL) 00471 fprintf(f, "exon_id %s\"%s%s\"; ", 00472 (sep == ',') ? "\\" : "", 00473 el->exonId, 00474 (sep == ',') ? "\\" : ""); 00475 if (sep == ',') fputc('"',f); 00476 fputc(lastSep,f); 00477 }
| struct gffFile* gffRead | ( | char * | fileName | ) | [read] |
Definition at line 367 of file gff.c.
References gffFileAdd(), and gffFileNew().
/* Create a new gffFile for fileName and load the file's contents into it
 * with a base offset of 0.  Returns the populated structure. */
struct gffFile *gffRead(char *fileName)
{
    struct gffFile *loaded = gffFileNew(fileName);

    gffFileAdd(loaded, fileName, 0);
    return loaded;
}
Here is the call graph for this function:

| static void gffSyntaxError | ( | char * | fileName, | |
| int | line, | |||
| char * | msg | |||
| ) | [static] |
Definition at line 82 of file gff.c.
References errAbort().
Referenced by gffFileAddRow().
Here is the call graph for this function:

Here is the caller graph for this function:

| static char* gffTnName | ( | char * | seqName, | |
| char * | groupName | |||
| ) | [static] |
Definition at line 88 of file gff.c.
References startsWith().
Referenced by gffFileAddRow().
/* Derive the group name used as hash key from groupName: a leading
 * "gene-" and then a leading "cc_" are stripped.  seqName is accepted for
 * interface compatibility but not used.
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call.  The copy is bounded by the buffer size, so a name longer than
 * 511 characters is truncated instead of overrunning the buffer as the
 * previous unbounded strcpy() did. */
static char *gffTnName(char *seqName, char *groupName)
{
    static char nameBuf[512];

    if (startsWith("gene-", groupName))
        groupName += 5;
    if (startsWith("cc_", groupName))
        groupName += 3;
    snprintf(nameBuf, sizeof nameBuf, "%s", groupName);
    return nameBuf;
}
Here is the call graph for this function:

Here is the caller graph for this function:

| static boolean isGtfGroup | ( | char * | group | ) | [static] |
Definition at line 100 of file gff.c.
References countChars(), FALSE, and TRUE.
Referenced by gffFileAddRow(), and gffHasGtfGroup().
00102 { 00103 if (strstr(group, "gene_id") == NULL) 00104 return FALSE; 00105 if (countChars(group, '"') >= 2) 00106 return TRUE; 00107 if (strstr(group, "transcript_id") != NULL) 00108 return TRUE; 00109 return FALSE; 00110 }
Here is the call graph for this function:

Here is the caller graph for this function:

| static void parseGtfEnd | ( | char * | s, | |
| struct gffFile * | gff, | |||
| struct gffLine * | gl, | |||
| char * | fileName, | |||
| int | lineIx | |||
| ) | [static] |
Definition at line 133 of file gff.c.
References AllocVar, cloneString(), errAbort(), gffFile::exonHash, gffLine::exonId, gffLine::exonNumber, FALSE, gffLine::geneId, gffFile::geneIdHash, gffFile::geneIdList, gffLine::group, gffFile::groupHash, gffFile::groupList, hashAdd(), hashLookup(), gffLine::intronId, gffLine::intronStatus, gffFile::intronStatusHash, gffGroup::name, gffGeneId::name, hashEl::name, nextWord(), gffLine::proteinId, gffFile::proteinIdHash, readQuotedString(), sameString, gffLine::seq, skipLeadingSpaces(), slAddHead, gffLine::source, TRUE, hashEl::val, and warn().
Referenced by gffFileAddRow().
00137 { 00138 char *type, *val; 00139 struct hashEl *hel; 00140 bool gotSemi; 00141 00142 for (;;) 00143 { 00144 gotSemi = FALSE; 00145 if ((type = nextWord(&s)) == NULL) 00146 break; 00147 s = skipLeadingSpaces(s); 00148 if (NULL == s || s[0] == 0) 00149 errAbort("Unpaired type/val on end of gtf line %d of %s", lineIx, fileName); 00150 if (s[0] == '"' || s[0] == '\'') 00151 { 00152 val = s; 00153 readQuotedString(fileName, lineIx, s, val, &s); 00154 } 00155 else 00156 { 00157 int len; 00158 val = nextWord(&s); 00159 len = strlen(val) - 1; 00160 if (val[len] == ';') 00161 { 00162 val[len] = 0; 00163 len -= 1; 00164 gotSemi = TRUE; 00165 } 00166 if (len < 0) 00167 errAbort("Empty value for %s line %d of %s", type, lineIx, fileName); 00168 } 00169 if (s != NULL && !gotSemi) 00170 { 00171 s = strchr(s, ';'); 00172 if (s != NULL) 00173 ++s; 00174 } 00175 /* only use the first occurance of gene_id and transcript_id */ 00176 if (sameString("gene_id", type) && (gl->geneId == NULL)) 00177 { 00178 struct gffGeneId *gg; 00179 if ((hel = hashLookup(gff->geneIdHash, val)) == NULL) 00180 { 00181 AllocVar(gg); 00182 hel = hashAdd(gff->geneIdHash, val, gg); 00183 gg->name = hel->name; 00184 slAddHead(&gff->geneIdList, gg); 00185 } 00186 else 00187 { 00188 gg = hel->val; 00189 } 00190 gl->geneId = gg->name; 00191 } 00192 else if (sameString("transcript_id", type) && (gl->group == NULL)) 00193 { 00194 struct gffGroup *gg; 00195 if ((hel = hashLookup(gff->groupHash, val)) == NULL) 00196 { 00197 AllocVar(gg); 00198 hel = hashAdd(gff->groupHash, val, gg); 00199 gg->name = hel->name; 00200 gg->seq = gl->seq; 00201 gg->source = gl->source; 00202 slAddHead(&gff->groupList, gg); 00203 } 00204 else 00205 { 00206 gg = hel->val; 00207 } 00208 gl->group = gg->name; 00209 } 00210 else if (sameString("exon_id", type)) 00211 { 00212 if ((hel = hashLookup(gff->exonHash, val)) == NULL) 00213 hel = hashAdd(gff->exonHash, val, NULL); 00214 gl->exonId = hel->val; 00215 } 00216 else if 
(sameString("exon_number", type)) 00217 { 00218 if (!isdigit(val[0])) 00219 errAbort("Expecting number after exon_number, got %s line %d of %s", val, lineIx, fileName); 00220 gl->exonNumber = atoi(val); 00221 } 00222 else if (sameString("intron_id", type)) 00223 gl->intronId = cloneString(val); 00224 else if (sameString("intron_status", type)) 00225 { 00226 if ((hel = hashLookup(gff->intronStatusHash, val)) == NULL) 00227 hel = hashAdd(gff->intronStatusHash, val, NULL); 00228 gl->intronStatus = hel->name; 00229 } 00230 else if (sameString("protein_id", type)) 00231 { 00232 if ((hel = hashLookup(gff->proteinIdHash, val)) == NULL) 00233 hel = hashAdd(gff->proteinIdHash, val, NULL); 00234 gl->proteinId = hel->name; 00235 } 00236 } 00237 if (gl->group == NULL) 00238 { 00239 if (gl->geneId == NULL) 00240 warn("No gene_id or transcript_id line %d of %s", lineIx, fileName); 00241 } 00242 }
Here is the call graph for this function:

Here is the caller graph for this function:

| static void readQuotedString | ( | char * | fileName, | |
| int | lineIx, | |||
| char * | in, | |||
| char * | out, | |||
| char ** | retNext | |||
| ) | [static] |
Definition at line 126 of file gff.c.
References errAbort(), and parseQuotedString().
Referenced by parseGtfEnd().
/* Thin wrapper around parseQuotedString(in, out, retNext): when that call
 * reports failure, abort with a message naming lineIx and fileName. */
static void readQuotedString(char *fileName, int lineIx, char *in, char *out, char **retNext)
{
    if (parseQuotedString(in, out, retNext))
        return;
    errAbort("Line %d of %s\n", lineIx, fileName);
}
Here is the call graph for this function:

Here is the caller graph for this function:

char const rcsid[] = "$Id: gff.c,v 1.21 2007/02/01 00:43:16 kate Exp $" [static] |
1.5.2