lib/oldGff.c

Go to the documentation of this file.
00001 /* oldGff - module for reading GFFs.  This is largely if not
00002  * entirely superceded by the gff module. 
00003  *
00004  *
00005  * This file is copyright 2002 Jim Kent, but license is hereby
00006  * granted for all use - public, private or commercial. */
00007 
00008 #include "common.h"
00009 #include "dnautil.h"
00010 #include "oldGff.h"
00011 #include "dnaseq.h"
00012 #include "htmshell.h"
00013 #include "portable.h"
00014 #include "localmem.h"
00015 
00016 static char const rcsid[] = "$Id: oldGff.c,v 1.6 2005/04/10 14:41:24 markd Exp $";
00017 
00018 #define errfile stdout
00019 
00020 static char _gffIdent[] = "##gff-version";
00021 
00022 struct gffSegLine
00023     {
00024     char seqname[64];   /* Name of DNA sequence this refers to. */
00025     char source[64];    /* Who put this segment here... */
00026     char feature[64];      /* CDS, E, I, exon, intron, ??? */
00027     long start, end;   /* Offsets into DNA array, end inclusive */
00028     char score[62];     /* A number between 0 and 1 */
00029     char strand[4];     /* + or - */
00030     char frame[4];     /* 0, 1, 2, or . */
00031     char group[128];  /* Name of gene  cosmid.number. */
00032     };
00033 
00034 static int gffSegLineScan(struct gff* gff, struct gffSegLine *seg)
00035 {
00036     int scanned = sscanf(gff->buf, "%s %s %s %ld %ld %s %1s %s %s",
00037         seg->seqname, seg->source, seg->feature,
00038         &seg->start, &seg->end,
00039         seg->score, seg->strand, seg->frame, seg->group);
00040     return scanned;
00041 }
00042 
00043 static boolean _gffSeekDoubleSharpLine();
00044 
00045 boolean gffOpen(struct gff *gff, char *fileName)
00046 /* Initialize gff structure and open file for it. */
00047 {
00048     dnaUtilOpen();
00049 
00050     /* Initialize structure and open file. */
00051     zeroBytes(gff, sizeof(*gff));
00052     gff->memPool = lmInit(16*1024);
00053     gff->fileSize = fileSize(fileName);
00054     if (gff->fileSize < 0 ||
00055        (gff->file = fopen(fileName, "rb")) == NULL)
00056             {
00057             warn("Couldn't find the file named %s\n", fileName);
00058             return FALSE;
00059             }
00060     strcpy(gff->fileName, fileName);
00061     gff->bufSize = ArraySize(gff->buf);
00062 
00063     /* Make sure it's a gff file. */
00064     _gffSeekDoubleSharpLine(gff);
00065     if (strncmp(gff->buf, _gffIdent, strlen(_gffIdent)) != 0)
00066         {
00067         warn("%s doesn't appear to be a .gff file\n", fileName);
00068         return FALSE;
00069         }
00070 
00071     return TRUE;
00072 }
00073 
00074 void gffClose(struct gff *gff)
00075 /* Close down gff structure. */
00076 {
00077 if (gff->file != NULL)
00078     fclose(gff->file);
00079 freeMem(gff->dna);
00080 lmCleanup(&gff->memPool);
00081 zeroBytes(gff, sizeof(*gff));
00082 }
00083 
00084 static boolean _gffGetLine(struct gff *gff)
00085 /* Get the next line into a gff file.  (private)
00086  * return FALSE at EOF or if problem. */
00087 {
00088 char *s;
00089 s = fgets(gff->buf, gff->bufSize, gff->file);
00090 if (s == NULL)
00091     {
00092     return FALSE;
00093     }
00094 gff->bytesInBuf = strlen(gff->buf);
00095 gff->readIx = 0;
00096 gff->lineNumber += 1;
00097 return TRUE;
00098 }
00099 
00100 #if 0 /* unused */
00101 static boolean _gffAtEof(struct gff *gff)
00102 /* Returns TRUE if at the end of gff file. */
00103 {
00104 return gff->file == NULL;
00105 }
00106 #endif  
00107 
00108 #if 0 /* unused */
00109 static char _getGffChar(struct gff *gff)
00110 /* Return next byte (not next base) in gff file. Return zero
00111  * if at end of file. */
00112 {
00113 if (gff->readIx >= gff->bytesInBuf)
00114         {
00115         if (!_gffGetLine(gff)) return 0;
00116         }
00117 return gff->buf[gff->readIx++];
00118 }
00119 #endif
00120 
00121 static boolean _gffSeekDoubleSharpLine(struct gff *gff)
00122 /* Go find next line that begins with ## */
00123 {
00124 for (;;)
00125     {
00126     if (!_gffGetLine(gff)) return FALSE;
00127     if (gff->bytesInBuf >= 2)
00128         if (gff->buf[0] == '#' && gff->buf[1] == '#') 
00129                 return TRUE;
00130     }
00131 }
00132 
00133 static boolean _gffSeekDna(struct gff *gff)
00134 /* Skip through file until you get the DNA */ 
00135 {
00136 static char dnaIdent[] = "##DNA";
00137 
00138 rewind(gff->file);
00139 for (;;)
00140     {
00141     if (!_gffGetLine(gff)) return FALSE;
00142     if (strncmp(gff->buf, dnaIdent, strlen(dnaIdent)) == 0)
00143         {
00144         sscanf(gff->buf, "##DNA %s", gff->dnaName);
00145         gff->bytesInBuf = 0; /* We're done with gff line. */
00146         return TRUE;
00147         }
00148     }
00149 }
00150 
00151 static boolean gffNextDnaLine(struct gff *gff)
00152 /* Fetches next line of DNA. */
00153 {
00154 static char endIdent[] = "##end-DNA";
00155 
00156 if (!_gffSeekDoubleSharpLine(gff)) 
00157     return FALSE;
00158 /* Check to see if have reached end of DNA sequence */
00159 if (strncmp(gff->buf, endIdent, strlen(endIdent))==0)
00160     {
00161     gff->bytesInBuf = 0; /* We're done with gff line. */
00162     return FALSE;
00163     }
00164 return TRUE;
00165 }
00166 
00167 boolean gffReadDna(struct gff *gff)
00168 /* Read all the DNA in a file. */
00169 {
00170 long dnaSize = 0;
00171 DNA *dna;
00172 DNA *line;
00173 int lineCount;
00174 DNA b;
00175 if (gff->dna != NULL)
00176         return TRUE; /* We already read it. */
00177 if (!_gffSeekDna(gff))
00178         return FALSE;
00179 if ((gff->dna = wantMem(gff->fileSize)) == NULL)
00180     {
00181     warn("Couldn't allocate %ld bytes for DNA\n",
00182         gff->fileSize);
00183     return FALSE;
00184     }
00185 dna = gff->dna;
00186 for (;;)
00187     {
00188     if (!gffNextDnaLine(gff))
00189         break;
00190     line = gff->buf + gff->readIx;
00191     lineCount = gff->bytesInBuf-gff->readIx;
00192     while (--lineCount >= 0)
00193         {
00194         b = *line++;
00195         if ((b = ntChars[(int)b]) != 0)
00196             {
00197             *dna++ = b;
00198             dnaSize += 1;
00199             }
00200         }
00201     }
00202 gff->dnaSize = dnaSize;
00203 return TRUE;
00204 }
00205 
00206 struct gffGene *gffFindGene(struct gff *gff, char *geneName)
00207 /* Find gene with given name.  Case sensitive. */
00208 {
00209 struct gffGene *g;
00210 
00211 for (g=gff->genes; g!=NULL; g=g->next)
00212     {
00213     if (strcmp(geneName, g->name) == 0)
00214         return g;
00215     }
00216 return NULL;
00217 }
00218 
00219 struct gffGene *gffFindGeneIgnoreCase(struct gff *gff, char *geneName)
00220 /* Find gene with given name.  Not case sensitive. */
00221 {
00222 struct gffGene *g;
00223 
00224 for (g=gff->genes; g!=NULL; g=g->next)
00225     {
00226     if (differentWord(geneName, g->name) == 0)
00227         return g;
00228     }
00229 return NULL;
00230 }
00231 
00232 /* Allocate memory and clear it to zero.  Report error
00233  * and kill program if can't allocate it. */
00234 static void *gffNeedMem(struct gff *gff, int size)
00235 {
00236 return lmAlloc(gff->memPool, size);
00237 }
00238 
00239 static void gffSegmentInsertSort(struct gffSegment **plist, 
00240         struct gffSegment *seg)
00241 /* Insert segment on list, keeping list ordered by start field 
00242    parameters:
00243          gffSegment **plist;     Pointer to list. 
00244          gffSegment *seg;        Segment to insert 
00245  */
00246 {
00247 struct gffSegment *next;
00248 long segStart = seg->start;
00249 
00250 for (;;)
00251     {
00252     next = *plist;
00253     if (next == NULL)
00254         break;
00255     if (next->start > segStart)
00256         break;
00257     plist = &(next->next);
00258     }
00259 seg->next = next;   
00260 *plist = seg;
00261 }
00262 
00263 static void offsetsFromExons(struct gffGene *gene)
00264 /* Figure out start and end offsets of gene from it's exons. */
00265 {
00266 GffExon *exon;
00267 long end = 0;
00268 long start = 0x7fffffff; /* I should use a .h file constant here... */
00269 for (exon = gene->exons; exon != NULL; exon = exon->next)
00270     {
00271     if (exon->start < start)
00272         start = exon->start;
00273     if (exon->end > end)
00274         end = exon->end;
00275     }
00276 gene->start = start;
00277 gene->end = end;
00278 }
00279 
00280 void gffPrintInfo(struct gff *gff, FILE *out)
00281 /* Print summary info about file. */
00282 {
00283 struct gffGene *gene;
00284 
00285 fprintf(out, "\n%s\n", gff->fileName);
00286 fprintf(out, "DNA %s (%ld bases)\n", 
00287         gff->dnaName, gff->dnaSize);
00288 fprintf(out, "%d genes\n", slCount(gff->genes));
00289 for (gene = gff->genes; gene != NULL; gene = gene->next)
00290     {
00291     fprintf(out, "gene %s has %ld bases, %d exons, %d introns\n",
00292         gene->name, gene->end - gene->start + 1,
00293         slCount(gene->exons), slCount(gene->introns));
00294     }
00295 }
00296 
00297 static boolean checkWordCount(struct gff *gff, int wordCount)
00298 {
00299 if (wordCount >= 9)
00300     return TRUE;
00301 else
00302     {
00303     warn("???%s???\n", gff->buf);
00304     warn("Can't handle line %d of %s.\n", 
00305             gff->lineNumber, gff->fileName);
00306     return FALSE;
00307     }
00308 }
00309 
00310 boolean gffReadGenes(struct gff *gff)
00311 /* Read all the gene (as opposed to base) info in file. */
00312 {
00313 int wordCount;
00314 struct gffSegLine seg;
00315 char curGroup[128];
00316 struct gffGene *gene = NULL;
00317 GffIntron *intron = NULL;
00318 GffExon *exon = NULL;
00319 boolean warnedUnknown = FALSE;
00320 boolean isNewGene;
00321 
00322 curGroup[0] = 0; /* Start off with no group */
00323 
00324 /* Line scanning loop. */
00325 for (;;)
00326     {
00327     /* Get next line and parse it into segLine data structure. */
00328     if (!_gffGetLine(gff)) 
00329         break;   /* End of file. */
00330     if (gff->buf[0] == '#')
00331         continue; /* Ignore sharp containing lines. */
00332     wordCount = gffSegLineScan(gff, &seg);
00333     if (wordCount < 9)
00334         continue; /* Ignore blank lines and short ones. */
00335 
00336     /* Make sure that start is less than or equal end. */
00337     if (seg.start > seg.end)
00338         {
00339         warn("start greater than end line %d of %s.\n",
00340                 gff->lineNumber, gff->fileName);
00341         return FALSE;
00342         }
00343 
00344     /* Get the gene we're working on.  First see if
00345      * it's the same as last time around. */
00346     isNewGene = FALSE;
00347     if (strcmp(seg.group, curGroup) != 0)
00348         {
00349         strcpy(curGroup, seg.group);
00350         if ((gene = gffFindGene(gff, seg.group)) == NULL)
00351             {
00352             /* It's a new gene! */
00353             if (!checkWordCount(gff, wordCount)) return FALSE;
00354             isNewGene = TRUE;
00355             gene = gffNeedMem(gff, sizeof(*gene));
00356             strcpy(gene->name, seg.group);
00357             slAddTail(&gff->genes, gene); 
00358             gene->strand = seg.strand[0];
00359             gene->frame = atoi(seg.frame);
00360             if (differentWord(seg.feature, "CDS") == 0)
00361                 {
00362                 gene->start = seg.start-1;
00363                 gene->end = seg.end-1;
00364                 }
00365             }
00366         }
00367 
00368     /* Look at what sort of feature it is, and decide what to do. */
00369 
00370     if (differentWord(seg.feature, "CDS")==0)
00371         {
00372         /* CDS (coding segments) have been processed already
00373          * for the most part. Here just make sure they aren't
00374          * duplicated. */
00375         if (!checkWordCount(gff, wordCount)) return FALSE;
00376         if (!isNewGene)
00377             {
00378             if (gene->start != 0 || gene->end != 0)
00379                 {
00380                 warn("Warning duplicate CDS for %s\n",
00381                         seg.group);
00382                 warn("Line %d of %s\n", 
00383                         gff->lineNumber, gff->fileName);
00384                 }
00385             }
00386         }
00387     else if (differentWord(seg.feature, "SE") == 0 
00388         ||   differentWord(seg.feature, "IE") == 0
00389         ||   differentWord(seg.feature, "FE") == 0
00390         ||   differentWord(seg.feature, "E") == 0
00391         ||   differentWord(seg.feature, "exon") == 0)
00392         {
00393         /* It's some sort of exon.  We'll deal with the complications
00394          * of it being possibly on the minus strand later, so can
00395          * tread initial, final, single, and regular exons the same
00396          * here. */
00397         if (!checkWordCount(gff, wordCount)) return FALSE;
00398         exon = gffNeedMem(gff, sizeof(*exon));
00399         exon->start = seg.start-1;
00400         exon->end = seg.end-1;
00401         exon->frame = atoi(seg.frame);
00402         gffSegmentInsertSort(&gene->exons, exon);
00403         }
00404     else if (differentWord(seg.feature, "I") == 0 
00405         ||   differentWord(seg.feature, "intron") == 0)
00406         {
00407         /* It's an intron. */
00408         if (!checkWordCount(gff, wordCount)) return FALSE;
00409         intron = gffNeedMem(gff, sizeof(*intron));
00410         intron->start = seg.start-1;
00411         intron->end = seg.end-1;
00412         intron->frame = atoi(seg.frame);
00413         gffSegmentInsertSort(&gene->introns, intron);
00414         }
00415     else if (strcmp(seg.feature, "IG")  == 0)
00416         {
00417         /* I don't know what it is, but we can ignore it. */
00418         }
00419     else
00420         {
00421         if (!warnedUnknown)
00422             {
00423             warn("Unknown feature %s line %d of %s, ignoring\n",
00424                     seg.feature,  gff->lineNumber, gff->fileName);
00425             warnedUnknown = TRUE;
00426             }
00427         }
00428     }
00429 
00430 /* Fix up gene length from exons if needed. */
00431 for (gene = gff->genes; gene != NULL; gene = gene->next)
00432     {
00433     if (gene->start >= gene->end)
00434         {
00435         offsetsFromExons(gene);
00436         }
00437     }
00438 return TRUE;
00439 }
00440 
00441 static boolean geneDna(struct gff *gff, struct gffGene *gene,
00442     int leftExtra, int rightExtra, char **retDna, long *retDnaSize,
00443     int *retStartOffset)
00444 /* Allocate an array and fill it with dna from a gene. */
00445 {
00446 char *dna;
00447 char *pt;
00448 long geneSize;
00449 long i;
00450 long seqStart, seqEnd, seqSize;
00451 
00452 /* Filter out unreasonable looking genes - input to this
00453  * program isn't totally clean. */
00454 geneSize = gene->end - gene->start + 1;
00455 if (geneSize <= 0 || geneSize >= 1000000)
00456     return FALSE;  
00457 
00458 /* Figure out extents of DNA we're going to return.
00459  * Return extra they ask for if possible, but clip
00460  * it to what is actually in GFF file. */
00461 seqStart = gene->start - leftExtra;
00462 seqEnd = gene->end + rightExtra + 1;
00463 if (seqStart < 0)
00464     seqStart = 0;
00465 if (seqEnd > gff->dnaSize)
00466     seqEnd = gff->dnaSize;
00467 seqSize = seqEnd - seqStart;
00468 
00469 /* Allocate memory and fetch the dna. */
00470 dna = needMem(seqSize+1);
00471 pt = dna;
00472 for (i=0; i<seqSize; i++)
00473     *pt++ = gff->dna[seqStart+i];
00474 *pt = 0;
00475 
00476 /* Report results back to caller. */
00477 *retDna = dna;
00478 *retDnaSize = seqSize;
00479 *retStartOffset = (gene->start - seqStart);
00480 return TRUE;
00481 }
00482 
00483 static void fixDirectionAndOffsets(struct gffGene *gene, char *dna, long dnaSize, int newStart)
00484 /* Reverse complement DNA if gene is on negative strand.
00485  * Update offsets of exons and introns in gene to 
00486  * make them point into dna, rather than into gff->dna. 
00487  */
00488 {
00489 long oldStart;
00490 long offset;
00491 GffIntron *intron;
00492 GffExon *exon;
00493 long temp;
00494 
00495 oldStart = gene->start;
00496 offset = oldStart - newStart;
00497 gene->start -= offset;
00498 gene->end -= offset;
00499 for (intron = gene->introns; intron != NULL; intron = intron->next)
00500     {
00501     intron->start -= offset;
00502     intron->end -= offset;
00503     }
00504 for (exon = gene->exons; exon != NULL; exon = exon->next)
00505     {
00506     exon->start -= offset;
00507     exon->end -= offset;
00508     }
00509 if (gene->strand == '-')
00510     {
00511     reverseComplement(dna, dnaSize);
00512     temp = reverseOffset(gene->start, dnaSize);
00513     gene->start = reverseOffset(gene->end, dnaSize);
00514     gene->end = temp;
00515     for (intron = gene->introns; intron != NULL; intron = intron->next)
00516         {
00517         temp = reverseOffset(intron->start, dnaSize);
00518         intron->start = reverseOffset(intron->end, dnaSize);
00519         intron->end = temp;
00520         }
00521     for (exon = gene->exons; exon != NULL; exon = exon->next)
00522         {
00523         temp = reverseOffset(exon->start, dnaSize);
00524         exon->start = reverseOffset(exon->end, dnaSize);
00525         exon->end = temp;
00526         }
00527     slReverse(&gene->introns);
00528     slReverse(&gene->exons);
00529     gene->strand = '+';
00530     }
00531 }
00532 
00533 static struct gffSegment *dupeSegmentList(struct gffSegment *oldList,
00534         struct gffSegment *newList)
00535 /* Duplicate a segment list into array of fresh memory. */
00536 {
00537 struct gffSegment *oldEl, *newEl;
00538 
00539 if (oldList == NULL)
00540     return NULL;
00541 for (oldEl = oldList, newEl = newList; oldEl != NULL; oldEl=oldEl->next, newEl += 1)
00542     {
00543     memcpy(newEl, oldEl, sizeof(*newEl));
00544     newEl->next = ((oldEl->next == NULL) ? NULL : newEl+1);
00545     }
00546 return newList;
00547 }
00548 
00549 struct gffGene *gffDupeGeneAndSurrounds(struct gff *gff, struct gffGene *oldGene,
00550     int leftExtra, int rightExtra)
00551 /* Make a duplicate of gene with extra DNA around coding region. 
00552  * gffFreeGene it when done. */
00553 /* In a perhaps hair brained scheme to save some cycles,
00554  * the memory allocation of the intron and exon lists
00555  * is shared with that of the gffGene itself. */
00556 {
00557 struct gffGene *g;
00558 int intronCount = slCount(oldGene->introns);
00559 int exonCount = slCount(oldGene->exons);
00560 int memSize = sizeof(*g) + (intronCount + exonCount) * sizeof(struct gffSegment);
00561 char *memPt;
00562 int firstExonOffset;
00563 
00564 
00565 memPt = needMem(memSize);
00566 g = (struct gffGene *)memPt;
00567 memPt += sizeof(*g);
00568 g->exons = (struct gffSegment *)memPt;
00569 memPt += exonCount*sizeof(struct gffSegment);
00570 g->introns = (struct gffSegment *)memPt;
00571 
00572 g->next = NULL;
00573 g->start = oldGene->start;
00574 g->end = oldGene->end;
00575 g->strand = oldGene->strand;
00576 memcpy(g->name, oldGene->name, sizeof(g->name));
00577 g->exons = dupeSegmentList(oldGene->exons, g->exons);
00578 g->introns = dupeSegmentList(oldGene->introns, g->introns);
00579 if (!geneDna(gff, oldGene, leftExtra, rightExtra, 
00580     &g->dna, &g->dnaSize, &firstExonOffset))
00581     {
00582     gffFreeGene(&g);
00583     return NULL;
00584     }
00585 fixDirectionAndOffsets(g, g->dna, g->dnaSize, firstExonOffset);
00586 return g;
00587 }
00588 
00589 struct gffGene *gffDupeGene(struct gff *gff, struct gffGene *oldGene)
00590 /* Make a duplicate of gene (with it's own DNA) */
00591 {
00592 return gffDupeGeneAndSurrounds(gff, oldGene, 0, 0);
00593 }
00594 
00595 struct gffGene *gffGeneWithOwnDna(struct gff *gff, char *geneName)
00596 /* Find gene with given name.  Case sensitive. */
00597 {
00598 struct gffGene *oldGene;
00599 
00600 oldGene = gffFindGeneIgnoreCase(gff, geneName);
00601 if (oldGene == NULL)
00602     return NULL;
00603 return gffDupeGene(gff, oldGene);
00604 }
00605 
00606 void gffFreeGene(struct gffGene **pGene)
00607 /* Free a gene returned with dupeGene or geneWithOwnDna. 
00608  * (You don't want to free the ones returned by findGene,
00609  * they are still owned by the gff.)
00610  */
00611 {
00612 struct gffGene *g = *pGene;
00613 if (g == NULL)
00614     return;
00615 freeMem(g->dna);
00616 freeMem(g);
00617 *pGene = NULL;
00618 }
00619 
00620 struct dnaSeq *gffReadDnaSeq(char *fileName)
00621 /* Open gff file and read DNA sequence from it. */
00622 {
00623 struct gff gff;
00624 struct dnaSeq *seq = NULL;
00625 
00626 if (!gffOpen(&gff, fileName))
00627     return NULL;
00628 if (gffReadDna(&gff))
00629     {
00630     seq = newDnaSeq(gff.dna, gff.dnaSize, gff.dnaName);
00631     gff.dna = NULL;
00632     }
00633 gffClose(&gff);
00634 return seq;
00635 }
00636 
00637 boolean gffOpenAndRead(struct gff *gff, char *fileName)
00638 /* Open up gff file and read everything in it. */
00639 {
00640 if (gffOpen(gff, fileName))
00641     if (gffReadDna(gff))
00642         if (gffReadGenes(gff))
00643             return TRUE;
00644 gffClose(gff);
00645 return FALSE;
00646 }

Generated on Tue Dec 25 18:39:31 2007 for blat by  doxygen 1.5.2