00001
00002
00003
00004
00005
00006
00007
00008 #include "common.h"
00009 #include "dnautil.h"
00010 #include "oldGff.h"
00011 #include "dnaseq.h"
00012 #include "htmshell.h"
00013 #include "portable.h"
00014 #include "localmem.h"
00015
00016 static char const rcsid[] = "$Id: oldGff.c,v 1.6 2005/04/10 14:41:24 markd Exp $";
00017
00018 #define errfile stdout
00019
00020 static char _gffIdent[] = "##gff-version";
00021
00022 struct gffSegLine
00023 {
00024 char seqname[64];
00025 char source[64];
00026 char feature[64];
00027 long start, end;
00028 char score[62];
00029 char strand[4];
00030 char frame[4];
00031 char group[128];
00032 };
00033
00034 static int gffSegLineScan(struct gff* gff, struct gffSegLine *seg)
00035 {
00036 int scanned = sscanf(gff->buf, "%s %s %s %ld %ld %s %1s %s %s",
00037 seg->seqname, seg->source, seg->feature,
00038 &seg->start, &seg->end,
00039 seg->score, seg->strand, seg->frame, seg->group);
00040 return scanned;
00041 }
00042
00043 static boolean _gffSeekDoubleSharpLine();
00044
00045 boolean gffOpen(struct gff *gff, char *fileName)
00046
00047 {
00048 dnaUtilOpen();
00049
00050
00051 zeroBytes(gff, sizeof(*gff));
00052 gff->memPool = lmInit(16*1024);
00053 gff->fileSize = fileSize(fileName);
00054 if (gff->fileSize < 0 ||
00055 (gff->file = fopen(fileName, "rb")) == NULL)
00056 {
00057 warn("Couldn't find the file named %s\n", fileName);
00058 return FALSE;
00059 }
00060 strcpy(gff->fileName, fileName);
00061 gff->bufSize = ArraySize(gff->buf);
00062
00063
00064 _gffSeekDoubleSharpLine(gff);
00065 if (strncmp(gff->buf, _gffIdent, strlen(_gffIdent)) != 0)
00066 {
00067 warn("%s doesn't appear to be a .gff file\n", fileName);
00068 return FALSE;
00069 }
00070
00071 return TRUE;
00072 }
00073
00074 void gffClose(struct gff *gff)
00075
00076 {
00077 if (gff->file != NULL)
00078 fclose(gff->file);
00079 freeMem(gff->dna);
00080 lmCleanup(&gff->memPool);
00081 zeroBytes(gff, sizeof(*gff));
00082 }
00083
00084 static boolean _gffGetLine(struct gff *gff)
00085
00086
00087 {
00088 char *s;
00089 s = fgets(gff->buf, gff->bufSize, gff->file);
00090 if (s == NULL)
00091 {
00092 return FALSE;
00093 }
00094 gff->bytesInBuf = strlen(gff->buf);
00095 gff->readIx = 0;
00096 gff->lineNumber += 1;
00097 return TRUE;
00098 }
00099
00100 #if 0
00101 static boolean _gffAtEof(struct gff *gff)
00102
00103 {
00104 return gff->file == NULL;
00105 }
00106 #endif
00107
00108 #if 0
00109 static char _getGffChar(struct gff *gff)
00110
00111
00112 {
00113 if (gff->readIx >= gff->bytesInBuf)
00114 {
00115 if (!_gffGetLine(gff)) return 0;
00116 }
00117 return gff->buf[gff->readIx++];
00118 }
00119 #endif
00120
00121 static boolean _gffSeekDoubleSharpLine(struct gff *gff)
00122
00123 {
00124 for (;;)
00125 {
00126 if (!_gffGetLine(gff)) return FALSE;
00127 if (gff->bytesInBuf >= 2)
00128 if (gff->buf[0] == '#' && gff->buf[1] == '#')
00129 return TRUE;
00130 }
00131 }
00132
00133 static boolean _gffSeekDna(struct gff *gff)
00134
00135 {
00136 static char dnaIdent[] = "##DNA";
00137
00138 rewind(gff->file);
00139 for (;;)
00140 {
00141 if (!_gffGetLine(gff)) return FALSE;
00142 if (strncmp(gff->buf, dnaIdent, strlen(dnaIdent)) == 0)
00143 {
00144 sscanf(gff->buf, "##DNA %s", gff->dnaName);
00145 gff->bytesInBuf = 0;
00146 return TRUE;
00147 }
00148 }
00149 }
00150
00151 static boolean gffNextDnaLine(struct gff *gff)
00152
00153 {
00154 static char endIdent[] = "##end-DNA";
00155
00156 if (!_gffSeekDoubleSharpLine(gff))
00157 return FALSE;
00158
00159 if (strncmp(gff->buf, endIdent, strlen(endIdent))==0)
00160 {
00161 gff->bytesInBuf = 0;
00162 return FALSE;
00163 }
00164 return TRUE;
00165 }
00166
00167 boolean gffReadDna(struct gff *gff)
00168
00169 {
00170 long dnaSize = 0;
00171 DNA *dna;
00172 DNA *line;
00173 int lineCount;
00174 DNA b;
00175 if (gff->dna != NULL)
00176 return TRUE;
00177 if (!_gffSeekDna(gff))
00178 return FALSE;
00179 if ((gff->dna = wantMem(gff->fileSize)) == NULL)
00180 {
00181 warn("Couldn't allocate %ld bytes for DNA\n",
00182 gff->fileSize);
00183 return FALSE;
00184 }
00185 dna = gff->dna;
00186 for (;;)
00187 {
00188 if (!gffNextDnaLine(gff))
00189 break;
00190 line = gff->buf + gff->readIx;
00191 lineCount = gff->bytesInBuf-gff->readIx;
00192 while (--lineCount >= 0)
00193 {
00194 b = *line++;
00195 if ((b = ntChars[(int)b]) != 0)
00196 {
00197 *dna++ = b;
00198 dnaSize += 1;
00199 }
00200 }
00201 }
00202 gff->dnaSize = dnaSize;
00203 return TRUE;
00204 }
00205
00206 struct gffGene *gffFindGene(struct gff *gff, char *geneName)
00207
00208 {
00209 struct gffGene *g;
00210
00211 for (g=gff->genes; g!=NULL; g=g->next)
00212 {
00213 if (strcmp(geneName, g->name) == 0)
00214 return g;
00215 }
00216 return NULL;
00217 }
00218
00219 struct gffGene *gffFindGeneIgnoreCase(struct gff *gff, char *geneName)
00220
00221 {
00222 struct gffGene *g;
00223
00224 for (g=gff->genes; g!=NULL; g=g->next)
00225 {
00226 if (differentWord(geneName, g->name) == 0)
00227 return g;
00228 }
00229 return NULL;
00230 }
00231
00232
00233
00234 static void *gffNeedMem(struct gff *gff, int size)
00235 {
00236 return lmAlloc(gff->memPool, size);
00237 }
00238
00239 static void gffSegmentInsertSort(struct gffSegment **plist,
00240 struct gffSegment *seg)
00241
00242
00243
00244
00245
00246 {
00247 struct gffSegment *next;
00248 long segStart = seg->start;
00249
00250 for (;;)
00251 {
00252 next = *plist;
00253 if (next == NULL)
00254 break;
00255 if (next->start > segStart)
00256 break;
00257 plist = &(next->next);
00258 }
00259 seg->next = next;
00260 *plist = seg;
00261 }
00262
00263 static void offsetsFromExons(struct gffGene *gene)
00264
00265 {
00266 GffExon *exon;
00267 long end = 0;
00268 long start = 0x7fffffff;
00269 for (exon = gene->exons; exon != NULL; exon = exon->next)
00270 {
00271 if (exon->start < start)
00272 start = exon->start;
00273 if (exon->end > end)
00274 end = exon->end;
00275 }
00276 gene->start = start;
00277 gene->end = end;
00278 }
00279
00280 void gffPrintInfo(struct gff *gff, FILE *out)
00281
00282 {
00283 struct gffGene *gene;
00284
00285 fprintf(out, "\n%s\n", gff->fileName);
00286 fprintf(out, "DNA %s (%ld bases)\n",
00287 gff->dnaName, gff->dnaSize);
00288 fprintf(out, "%d genes\n", slCount(gff->genes));
00289 for (gene = gff->genes; gene != NULL; gene = gene->next)
00290 {
00291 fprintf(out, "gene %s has %ld bases, %d exons, %d introns\n",
00292 gene->name, gene->end - gene->start + 1,
00293 slCount(gene->exons), slCount(gene->introns));
00294 }
00295 }
00296
00297 static boolean checkWordCount(struct gff *gff, int wordCount)
00298 {
00299 if (wordCount >= 9)
00300 return TRUE;
00301 else
00302 {
00303 warn("???%s???\n", gff->buf);
00304 warn("Can't handle line %d of %s.\n",
00305 gff->lineNumber, gff->fileName);
00306 return FALSE;
00307 }
00308 }
00309
00310 boolean gffReadGenes(struct gff *gff)
00311
00312 {
00313 int wordCount;
00314 struct gffSegLine seg;
00315 char curGroup[128];
00316 struct gffGene *gene = NULL;
00317 GffIntron *intron = NULL;
00318 GffExon *exon = NULL;
00319 boolean warnedUnknown = FALSE;
00320 boolean isNewGene;
00321
00322 curGroup[0] = 0;
00323
00324
00325 for (;;)
00326 {
00327
00328 if (!_gffGetLine(gff))
00329 break;
00330 if (gff->buf[0] == '#')
00331 continue;
00332 wordCount = gffSegLineScan(gff, &seg);
00333 if (wordCount < 9)
00334 continue;
00335
00336
00337 if (seg.start > seg.end)
00338 {
00339 warn("start greater than end line %d of %s.\n",
00340 gff->lineNumber, gff->fileName);
00341 return FALSE;
00342 }
00343
00344
00345
00346 isNewGene = FALSE;
00347 if (strcmp(seg.group, curGroup) != 0)
00348 {
00349 strcpy(curGroup, seg.group);
00350 if ((gene = gffFindGene(gff, seg.group)) == NULL)
00351 {
00352
00353 if (!checkWordCount(gff, wordCount)) return FALSE;
00354 isNewGene = TRUE;
00355 gene = gffNeedMem(gff, sizeof(*gene));
00356 strcpy(gene->name, seg.group);
00357 slAddTail(&gff->genes, gene);
00358 gene->strand = seg.strand[0];
00359 gene->frame = atoi(seg.frame);
00360 if (differentWord(seg.feature, "CDS") == 0)
00361 {
00362 gene->start = seg.start-1;
00363 gene->end = seg.end-1;
00364 }
00365 }
00366 }
00367
00368
00369
00370 if (differentWord(seg.feature, "CDS")==0)
00371 {
00372
00373
00374
00375 if (!checkWordCount(gff, wordCount)) return FALSE;
00376 if (!isNewGene)
00377 {
00378 if (gene->start != 0 || gene->end != 0)
00379 {
00380 warn("Warning duplicate CDS for %s\n",
00381 seg.group);
00382 warn("Line %d of %s\n",
00383 gff->lineNumber, gff->fileName);
00384 }
00385 }
00386 }
00387 else if (differentWord(seg.feature, "SE") == 0
00388 || differentWord(seg.feature, "IE") == 0
00389 || differentWord(seg.feature, "FE") == 0
00390 || differentWord(seg.feature, "E") == 0
00391 || differentWord(seg.feature, "exon") == 0)
00392 {
00393
00394
00395
00396
00397 if (!checkWordCount(gff, wordCount)) return FALSE;
00398 exon = gffNeedMem(gff, sizeof(*exon));
00399 exon->start = seg.start-1;
00400 exon->end = seg.end-1;
00401 exon->frame = atoi(seg.frame);
00402 gffSegmentInsertSort(&gene->exons, exon);
00403 }
00404 else if (differentWord(seg.feature, "I") == 0
00405 || differentWord(seg.feature, "intron") == 0)
00406 {
00407
00408 if (!checkWordCount(gff, wordCount)) return FALSE;
00409 intron = gffNeedMem(gff, sizeof(*intron));
00410 intron->start = seg.start-1;
00411 intron->end = seg.end-1;
00412 intron->frame = atoi(seg.frame);
00413 gffSegmentInsertSort(&gene->introns, intron);
00414 }
00415 else if (strcmp(seg.feature, "IG") == 0)
00416 {
00417
00418 }
00419 else
00420 {
00421 if (!warnedUnknown)
00422 {
00423 warn("Unknown feature %s line %d of %s, ignoring\n",
00424 seg.feature, gff->lineNumber, gff->fileName);
00425 warnedUnknown = TRUE;
00426 }
00427 }
00428 }
00429
00430
00431 for (gene = gff->genes; gene != NULL; gene = gene->next)
00432 {
00433 if (gene->start >= gene->end)
00434 {
00435 offsetsFromExons(gene);
00436 }
00437 }
00438 return TRUE;
00439 }
00440
00441 static boolean geneDna(struct gff *gff, struct gffGene *gene,
00442 int leftExtra, int rightExtra, char **retDna, long *retDnaSize,
00443 int *retStartOffset)
00444
00445 {
00446 char *dna;
00447 char *pt;
00448 long geneSize;
00449 long i;
00450 long seqStart, seqEnd, seqSize;
00451
00452
00453
00454 geneSize = gene->end - gene->start + 1;
00455 if (geneSize <= 0 || geneSize >= 1000000)
00456 return FALSE;
00457
00458
00459
00460
00461 seqStart = gene->start - leftExtra;
00462 seqEnd = gene->end + rightExtra + 1;
00463 if (seqStart < 0)
00464 seqStart = 0;
00465 if (seqEnd > gff->dnaSize)
00466 seqEnd = gff->dnaSize;
00467 seqSize = seqEnd - seqStart;
00468
00469
00470 dna = needMem(seqSize+1);
00471 pt = dna;
00472 for (i=0; i<seqSize; i++)
00473 *pt++ = gff->dna[seqStart+i];
00474 *pt = 0;
00475
00476
00477 *retDna = dna;
00478 *retDnaSize = seqSize;
00479 *retStartOffset = (gene->start - seqStart);
00480 return TRUE;
00481 }
00482
00483 static void fixDirectionAndOffsets(struct gffGene *gene, char *dna, long dnaSize, int newStart)
00484
00485
00486
00487
00488 {
00489 long oldStart;
00490 long offset;
00491 GffIntron *intron;
00492 GffExon *exon;
00493 long temp;
00494
00495 oldStart = gene->start;
00496 offset = oldStart - newStart;
00497 gene->start -= offset;
00498 gene->end -= offset;
00499 for (intron = gene->introns; intron != NULL; intron = intron->next)
00500 {
00501 intron->start -= offset;
00502 intron->end -= offset;
00503 }
00504 for (exon = gene->exons; exon != NULL; exon = exon->next)
00505 {
00506 exon->start -= offset;
00507 exon->end -= offset;
00508 }
00509 if (gene->strand == '-')
00510 {
00511 reverseComplement(dna, dnaSize);
00512 temp = reverseOffset(gene->start, dnaSize);
00513 gene->start = reverseOffset(gene->end, dnaSize);
00514 gene->end = temp;
00515 for (intron = gene->introns; intron != NULL; intron = intron->next)
00516 {
00517 temp = reverseOffset(intron->start, dnaSize);
00518 intron->start = reverseOffset(intron->end, dnaSize);
00519 intron->end = temp;
00520 }
00521 for (exon = gene->exons; exon != NULL; exon = exon->next)
00522 {
00523 temp = reverseOffset(exon->start, dnaSize);
00524 exon->start = reverseOffset(exon->end, dnaSize);
00525 exon->end = temp;
00526 }
00527 slReverse(&gene->introns);
00528 slReverse(&gene->exons);
00529 gene->strand = '+';
00530 }
00531 }
00532
00533 static struct gffSegment *dupeSegmentList(struct gffSegment *oldList,
00534 struct gffSegment *newList)
00535
00536 {
00537 struct gffSegment *oldEl, *newEl;
00538
00539 if (oldList == NULL)
00540 return NULL;
00541 for (oldEl = oldList, newEl = newList; oldEl != NULL; oldEl=oldEl->next, newEl += 1)
00542 {
00543 memcpy(newEl, oldEl, sizeof(*newEl));
00544 newEl->next = ((oldEl->next == NULL) ? NULL : newEl+1);
00545 }
00546 return newList;
00547 }
00548
00549 struct gffGene *gffDupeGeneAndSurrounds(struct gff *gff, struct gffGene *oldGene,
00550 int leftExtra, int rightExtra)
00551
00552
00553
00554
00555
00556 {
00557 struct gffGene *g;
00558 int intronCount = slCount(oldGene->introns);
00559 int exonCount = slCount(oldGene->exons);
00560 int memSize = sizeof(*g) + (intronCount + exonCount) * sizeof(struct gffSegment);
00561 char *memPt;
00562 int firstExonOffset;
00563
00564
00565 memPt = needMem(memSize);
00566 g = (struct gffGene *)memPt;
00567 memPt += sizeof(*g);
00568 g->exons = (struct gffSegment *)memPt;
00569 memPt += exonCount*sizeof(struct gffSegment);
00570 g->introns = (struct gffSegment *)memPt;
00571
00572 g->next = NULL;
00573 g->start = oldGene->start;
00574 g->end = oldGene->end;
00575 g->strand = oldGene->strand;
00576 memcpy(g->name, oldGene->name, sizeof(g->name));
00577 g->exons = dupeSegmentList(oldGene->exons, g->exons);
00578 g->introns = dupeSegmentList(oldGene->introns, g->introns);
00579 if (!geneDna(gff, oldGene, leftExtra, rightExtra,
00580 &g->dna, &g->dnaSize, &firstExonOffset))
00581 {
00582 gffFreeGene(&g);
00583 return NULL;
00584 }
00585 fixDirectionAndOffsets(g, g->dna, g->dnaSize, firstExonOffset);
00586 return g;
00587 }
00588
00589 struct gffGene *gffDupeGene(struct gff *gff, struct gffGene *oldGene)
00590
00591 {
00592 return gffDupeGeneAndSurrounds(gff, oldGene, 0, 0);
00593 }
00594
00595 struct gffGene *gffGeneWithOwnDna(struct gff *gff, char *geneName)
00596
00597 {
00598 struct gffGene *oldGene;
00599
00600 oldGene = gffFindGeneIgnoreCase(gff, geneName);
00601 if (oldGene == NULL)
00602 return NULL;
00603 return gffDupeGene(gff, oldGene);
00604 }
00605
00606 void gffFreeGene(struct gffGene **pGene)
00607
00608
00609
00610
00611 {
00612 struct gffGene *g = *pGene;
00613 if (g == NULL)
00614 return;
00615 freeMem(g->dna);
00616 freeMem(g);
00617 *pGene = NULL;
00618 }
00619
00620 struct dnaSeq *gffReadDnaSeq(char *fileName)
00621
00622 {
00623 struct gff gff;
00624 struct dnaSeq *seq = NULL;
00625
00626 if (!gffOpen(&gff, fileName))
00627 return NULL;
00628 if (gffReadDna(&gff))
00629 {
00630 seq = newDnaSeq(gff.dna, gff.dnaSize, gff.dnaName);
00631 gff.dna = NULL;
00632 }
00633 gffClose(&gff);
00634 return seq;
00635 }
00636
00637 boolean gffOpenAndRead(struct gff *gff, char *fileName)
00638
00639 {
00640 if (gffOpen(gff, fileName))
00641 if (gffReadDna(gff))
00642 if (gffReadGenes(gff))
00643 return TRUE;
00644 gffClose(gff);
00645 return FALSE;
00646 }