00001
00002
00003
00004
00005
00006
00007 #include "common.h"
00008 #include "hash.h"
00009 #include "linefile.h"
00010 #include "gff.h"
00011 #include "obscure.h"
00012
00013 static char const rcsid[] = "$Id: gff.c,v 1.21 2007/02/01 00:43:16 kate Exp $";
00014
00015 void gffGroupFree(struct gffGroup **pGroup)
00016
00017 {
00018 struct gffGroup *group;
00019 if ((group = *pGroup) != NULL)
00020 {
00021 slFreeList(&group->lineList);
00022 freez(pGroup);
00023 }
00024 }
00025
00026 void gffGroupFreeList(struct gffGroup **pList)
00027
00028 {
00029 struct gffGroup *el, *next;
00030 for (el = *pList; el != NULL; el = next)
00031 {
00032 next = el->next;
00033 gffGroupFree(&el);
00034 }
00035 *pList = NULL;
00036 }
00037
00038
00039 void gffFileFree(struct gffFile **pGff)
00040
00041 {
00042 struct gffFile *gff;
00043 if ((gff = *pGff) != NULL)
00044 {
00045 freeMem(gff->fileName);
00046 freeHash(&gff->seqHash);
00047 freeHash(&gff->sourceHash);
00048 freeHash(&gff->featureHash);
00049 freeHash(&gff->groupHash);
00050 freeHash(&gff->geneIdHash);
00051 freeHash(&gff->exonHash);
00052 freeHash(&gff->intronStatusHash);
00053 freeHash(&gff->proteinIdHash);
00054 slFreeList(&gff->lineList);
00055 slFreeList(&gff->seqList);
00056 slFreeList(&gff->sourceList);
00057 slFreeList(&gff->featureList);
00058 slFreeList(&gff->geneIdList);
00059 gffGroupFreeList(&gff->groupList);
00060 freez(pGff);
00061 }
00062 }
00063
00064 int gffLineCmp(const void *va, const void *vb)
00065
00066 {
00067 const struct gffLine *a = *((struct gffLine **)va);
00068 const struct gffLine *b = *((struct gffLine **)vb);
00069 int diff;
00070
00071
00072
00073 diff = strcmp(a->seq, b->seq);
00074 if (diff == 0)
00075 diff = a->start - b->start;
00076 if (diff == 0)
00077 diff = a->end - b->end;
00078 return diff;
00079 }
00080
00081
00082 static void gffSyntaxError(char *fileName, int line, char *msg)
00083
00084 {
00085 errAbort("%s Bad line %d of %s:\n", msg, line, fileName);
00086 }
00087
00088 static char *gffTnName(char *seqName, char *groupName)
00089
00090 {
00091 static char nameBuf[512];
00092 if (startsWith("gene-", groupName))
00093 groupName += 5;
00094 if (startsWith("cc_", groupName))
00095 groupName += 3;
00096 strcpy(nameBuf, groupName);
00097 return nameBuf;
00098 }
00099
00100 static boolean isGtfGroup(char *group)
00101
00102 {
00103 if (strstr(group, "gene_id") == NULL)
00104 return FALSE;
00105 if (countChars(group, '"') >= 2)
00106 return TRUE;
00107 if (strstr(group, "transcript_id") != NULL)
00108 return TRUE;
00109 return FALSE;
00110 }
00111
00112 boolean gffHasGtfGroup(char *line)
00113
00114 {
00115 char *words[10];
00116 char *dupe = cloneString(line);
00117 int wordCt = chopTabs(dupe, words);
00118 boolean isGtf = FALSE;
00119 if (wordCt >= 9)
00120 if (isGtfGroup(words[8]))
00121 isGtf = TRUE;
00122 freeMem(dupe);
00123 return isGtf;
00124 }
00125
00126 static void readQuotedString(char *fileName, int lineIx, char *in, char *out, char **retNext)
00127
00128 {
00129 if (!parseQuotedString(in, out, retNext))
00130 errAbort("Line %d of %s\n", lineIx, fileName);
00131 }
00132
00133 static void parseGtfEnd(char *s, struct gffFile *gff, struct gffLine *gl,
00134 char *fileName, int lineIx)
00135
00136
00137 {
00138 char *type, *val;
00139 struct hashEl *hel;
00140 bool gotSemi;
00141
00142 for (;;)
00143 {
00144 gotSemi = FALSE;
00145 if ((type = nextWord(&s)) == NULL)
00146 break;
00147 s = skipLeadingSpaces(s);
00148 if (NULL == s || s[0] == 0)
00149 errAbort("Unpaired type/val on end of gtf line %d of %s", lineIx, fileName);
00150 if (s[0] == '"' || s[0] == '\'')
00151 {
00152 val = s;
00153 readQuotedString(fileName, lineIx, s, val, &s);
00154 }
00155 else
00156 {
00157 int len;
00158 val = nextWord(&s);
00159 len = strlen(val) - 1;
00160 if (val[len] == ';')
00161 {
00162 val[len] = 0;
00163 len -= 1;
00164 gotSemi = TRUE;
00165 }
00166 if (len < 0)
00167 errAbort("Empty value for %s line %d of %s", type, lineIx, fileName);
00168 }
00169 if (s != NULL && !gotSemi)
00170 {
00171 s = strchr(s, ';');
00172 if (s != NULL)
00173 ++s;
00174 }
00175
00176 if (sameString("gene_id", type) && (gl->geneId == NULL))
00177 {
00178 struct gffGeneId *gg;
00179 if ((hel = hashLookup(gff->geneIdHash, val)) == NULL)
00180 {
00181 AllocVar(gg);
00182 hel = hashAdd(gff->geneIdHash, val, gg);
00183 gg->name = hel->name;
00184 slAddHead(&gff->geneIdList, gg);
00185 }
00186 else
00187 {
00188 gg = hel->val;
00189 }
00190 gl->geneId = gg->name;
00191 }
00192 else if (sameString("transcript_id", type) && (gl->group == NULL))
00193 {
00194 struct gffGroup *gg;
00195 if ((hel = hashLookup(gff->groupHash, val)) == NULL)
00196 {
00197 AllocVar(gg);
00198 hel = hashAdd(gff->groupHash, val, gg);
00199 gg->name = hel->name;
00200 gg->seq = gl->seq;
00201 gg->source = gl->source;
00202 slAddHead(&gff->groupList, gg);
00203 }
00204 else
00205 {
00206 gg = hel->val;
00207 }
00208 gl->group = gg->name;
00209 }
00210 else if (sameString("exon_id", type))
00211 {
00212 if ((hel = hashLookup(gff->exonHash, val)) == NULL)
00213 hel = hashAdd(gff->exonHash, val, NULL);
00214 gl->exonId = hel->val;
00215 }
00216 else if (sameString("exon_number", type))
00217 {
00218 if (!isdigit(val[0]))
00219 errAbort("Expecting number after exon_number, got %s line %d of %s", val, lineIx, fileName);
00220 gl->exonNumber = atoi(val);
00221 }
00222 else if (sameString("intron_id", type))
00223 gl->intronId = cloneString(val);
00224 else if (sameString("intron_status", type))
00225 {
00226 if ((hel = hashLookup(gff->intronStatusHash, val)) == NULL)
00227 hel = hashAdd(gff->intronStatusHash, val, NULL);
00228 gl->intronStatus = hel->name;
00229 }
00230 else if (sameString("protein_id", type))
00231 {
00232 if ((hel = hashLookup(gff->proteinIdHash, val)) == NULL)
00233 hel = hashAdd(gff->proteinIdHash, val, NULL);
00234 gl->proteinId = hel->name;
00235 }
00236 }
00237 if (gl->group == NULL)
00238 {
00239 if (gl->geneId == NULL)
00240 warn("No gene_id or transcript_id line %d of %s", lineIx, fileName);
00241 }
00242 }
00243
00244 void gffFileAddRow(struct gffFile *gff, int baseOffset, char *words[], int wordCount,
00245 char *fileName, int lineIx)
00246
00247 {
00248 struct hashEl *hel;
00249 struct gffLine *gl;
00250
00251 if (wordCount < 8)
00252 gffSyntaxError(fileName, lineIx, "Word count less than 8 ");
00253 AllocVar(gl);
00254
00255 if ((hel = hashLookup(gff->seqHash, words[0])) == NULL)
00256 {
00257 struct gffSeqName *el;
00258 AllocVar(el);
00259 hel = hashAdd(gff->seqHash, words[0], el);
00260 el->name = hel->name;
00261 slAddHead(&gff->seqList, el);
00262 }
00263 gl->seq = hel->name;
00264
00265 if ((hel = hashLookup(gff->sourceHash, words[1])) == NULL)
00266 {
00267 struct gffSource *el;
00268 AllocVar(el);
00269 hel = hashAdd(gff->sourceHash, words[1], el);
00270 el->name = hel->name;
00271 slAddHead(&gff->sourceList, el);
00272 }
00273 gl->source = hel->name;
00274
00275 if ((hel = hashLookup(gff->featureHash, words[2])) == NULL)
00276 {
00277 struct gffFeature *el;
00278 AllocVar(el);
00279 hel = hashAdd(gff->featureHash, words[2], el);
00280 el->name = hel->name;
00281 slAddHead(&gff->featureList, el);
00282 }
00283 gl->feature = hel->name;
00284
00285 if (!isdigit(words[3][0]) || !isdigit(words[4][0]))
00286 gffSyntaxError(fileName, lineIx, "col 3 or 4 not a number ");
00287 gl->start = atoi(words[3])-1 + baseOffset;
00288 gl->end = atoi(words[4]) + baseOffset;
00289 gl->score = atof(words[5]);
00290 gl->strand = words[6][0];
00291 gl->frame = words[7][0];
00292
00293 if (wordCount >= 9)
00294 {
00295 if (!gff->typeKnown)
00296 {
00297 gff->typeKnown = TRUE;
00298 gff->isGtf = isGtfGroup(words[8]);
00299 }
00300 if (gff->isGtf)
00301 {
00302 parseGtfEnd(words[8], gff, gl, fileName, lineIx);
00303 }
00304 else
00305 {
00306 char *tnName = gffTnName(gl->seq, words[8]);
00307 if ((hel = hashLookup(gff->groupHash, tnName)) == NULL)
00308 {
00309 struct gffGroup *group;
00310 AllocVar(group);
00311 hel = hashAdd(gff->groupHash, tnName, group);
00312 group->name = hel->name;
00313 group->seq = gl->seq;
00314 group->source = gl->source;
00315 slAddHead(&gff->groupList, group);
00316 }
00317 gl->group = hel->name;
00318 }
00319 }
00320 slAddHead(&gff->lineList, gl);
00321 }
00322
00323
00324 void gffFileAdd(struct gffFile *gff, char *fileName, int baseOffset)
00325
00326 {
00327
00328 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00329 char *line, *words[9];
00330 int lineSize, wordCount;
00331
00332 while (lineFileNext(lf, &line, &lineSize))
00333 {
00334 if (line[0] != '#')
00335 {
00336 wordCount = chopTabs(line, words);
00337 if (wordCount > 0)
00338 gffFileAddRow(gff, baseOffset, words, wordCount, lf->fileName, lf->lineIx);
00339 }
00340 }
00341 slReverse(&gff->lineList);
00342 slReverse(&gff->seqList);
00343 slReverse(&gff->sourceList);
00344 slReverse(&gff->featureList);
00345 slReverse(&gff->groupList);
00346 slReverse(&gff->geneIdList);
00347 lineFileClose(&lf);
00348 }
00349
00350 struct gffFile *gffFileNew(char *fileName)
00351
00352 {
00353 struct gffFile *gff;
00354 AllocVar(gff);
00355 gff->fileName = cloneString(fileName);
00356 gff->seqHash = newHash(6);
00357 gff->sourceHash = newHash(6);
00358 gff->featureHash = newHash(6);
00359 gff->groupHash = newHash(12);
00360 gff->geneIdHash = newHash(12);
00361 gff->exonHash = newHash(16);
00362 gff->intronStatusHash = newHash(4);
00363 gff->proteinIdHash = newHash(12);
00364 return gff;
00365 }
00366
00367 struct gffFile *gffRead(char *fileName)
00368
00369 {
00370 struct gffFile *gff = gffFileNew(fileName);
00371 gffFileAdd(gff, fileName, 0);
00372 return gff;
00373 }
00374
00375 static void getGroupBoundaries(struct gffGroup *group)
00376
00377 {
00378 struct gffLine *line;
00379 int start = 0x3fffffff;
00380 int end = -start;
00381 line = group->lineList;
00382 group->strand = line->strand;
00383 for (; line != NULL; line = line->next)
00384 {
00385 if (start > line->start)
00386 start = line->start;
00387 if (end < line->end)
00388 end = line->end;
00389 }
00390 group->start = start;
00391 group->end = end;
00392 }
00393
00394 void gffGroupLines(struct gffFile *gff)
00395
00396
00397 {
00398 struct gffLine *line, *nextLine;
00399 struct hash *groupHash = gff->groupHash;
00400 char *groupName;
00401 struct gffGroup *group;
00402 struct gffLine *ungroupedLines = NULL;
00403
00404 for (line = gff->lineList; line != NULL; line = nextLine)
00405 {
00406 nextLine = line->next;
00407 if ((groupName = line->group) != NULL)
00408 {
00409 struct hashEl *hel = hashLookup(groupHash, groupName);
00410 group = hel->val;
00411 slAddHead(&group->lineList, line);
00412 }
00413 else
00414 {
00415 slAddHead(&ungroupedLines, line);
00416 }
00417 }
00418
00419
00420 slReverse(&ungroupedLines);
00421 gff->lineList = ungroupedLines;
00422
00423
00424 for (group = gff->groupList; group != NULL; group = group->next)
00425 {
00426 slSort(&group->lineList, gffLineCmp);
00427 getGroupBoundaries(group);
00428 }
00429 }
00430
00431 void gffOutput(struct gffLine *el, FILE *f, char sep, char lastSep)
00432
00433 {
00434 if (sep == ',') fputc('"',f);
00435 fprintf(f, "%s", el->seq);
00436 if (sep == ',') fputc('"',f);
00437 fputc(sep,f);
00438 if (sep == ',') fputc('"',f);
00439 fprintf(f, "%s", el->source);
00440 if (sep == ',') fputc('"',f);
00441 fputc(sep,f);
00442 if (sep == ',') fputc('"',f);
00443 fprintf(f, "%s", el->feature);
00444 if (sep == ',') fputc('"',f);
00445 fputc(sep,f);
00446 fprintf(f, "%u", el->start+1);
00447 fputc(sep,f);
00448 fprintf(f, "%u", el->end);
00449 fputc(sep,f);
00450 fprintf(f, "%f", el->score);
00451 fputc(sep,f);
00452 if (sep == ',') fputc('"',f);
00453 fprintf(f, "%c", el->strand);
00454 if (sep == ',') fputc('"',f);
00455 fputc(sep,f);
00456 if (sep == ',') fputc('"',f);
00457 fprintf(f, "%c", el->frame);
00458 if (sep == ',') fputc('"',f);
00459 fputc(sep,f);
00460 if (sep == ',') fputc('"',f);
00461 if (el->geneId != NULL)
00462 fprintf(f, "gene_id %s\"%s%s\"; ",
00463 (sep == ',') ? "\\" : "",
00464 el->geneId,
00465 (sep == ',') ? "\\" : "");
00466 fprintf(f, "transcript_id %s\"%s%s\"; ",
00467 (sep == ',') ? "\\" : "",
00468 el->group,
00469 (sep == ',') ? "\\" : "");
00470 if (el->exonId != NULL)
00471 fprintf(f, "exon_id %s\"%s%s\"; ",
00472 (sep == ',') ? "\\" : "",
00473 el->exonId,
00474 (sep == ',') ? "\\" : "");
00475 if (sep == ',') fputc('"',f);
00476 fputc(lastSep,f);
00477 }
00478