lib/emblParse.c

Go to the documentation of this file.
00001 /* Parse EMBL formatted files. EMBL files are basically line
00002  * oriented.  Each line begins with a short (usually two letter)
00003  * type word.  Adjacent lines with the same type are generally
00004  * considered logical extensions of each other.  In many cases
00005  * lines can be considered fields in an EMBL database.  Records
00006  * are separated by lines starting with '//'.  Generally lines
00007  * starting with XX are empty and used to make the records more
00008  * human readable.   Here is an example record:
00009  
00010  AC  M00001
00011  XX
00012  ID  V$MYOD_01
00013  XX
00014  NA  MyoD
00015  XX
00016  DT  EWI (created); 19.10.92.
00017  DT  ewi (updated); 22.06.95.
00018  XX
00019  PO     A     C     G     T
00020  01     0     0     0     0
00021  02     0     0     0     0
00022  03     1     2     2     0
00023  04     2     1     2     0
00024  05     3     0     1     1
00025  06     0     5     0     0
00026  07     5     0     0     0
00027  08     0     0     4     1
00028  09     0     1     4     0
00029  10     0     0     0     5
00030  11     0     0     5     0
00031  12     0     1     2     2
00032  13     0     2     0     3
00033  14     1     0     3     1
00034  15     0     0     0     0
00035  16     0     0     0     0
00036  17     0     0     0     0
00037  XX
00038  BF  T00526; MyoD                         ; mouse
00039  XX
00040  BA  5 functional elements in 3 genes
00041  XX
00042  XX
00043  //
00044  
00045  */
00046 
00047 #include "common.h"
00048 #include "linefile.h"
00049 #include "hash.h"
00050 #include "emblParse.h"
00051 
00052 static char const rcsid[] = "$Id: emblParse.c,v 1.3 2003/05/06 07:33:42 kate Exp $";
00053 
00054 boolean emblLineGroup(struct lineFile *lf, char type[16], struct dyString *val)
00055 /* Read next line of embl file.  Read line after that too if it
00056  * starts with the same type field. Return FALSE at EOF. */
00057 {
00058 char *line, *word;
00059 int typeLen = 0;
00060 
00061 dyStringClear(val);
00062 while (lineFileNext(lf, &line, NULL))
00063     {
00064     line = skipLeadingSpaces(line);
00065 
00066     /* Parse out first word into type. */
00067     if (isspace(line[0]))
00068         errAbort("embl line that doesn't start with type line %d of %s", 
00069                 lf->lineIx, lf->fileName);
00070     if (typeLen == 0)
00071         {
00072         word = nextWord(&line);
00073         typeLen = strlen(word);
00074         if (typeLen >= 16)
00075             errAbort("Type word at start of line too long for embl file line %d of %s",
00076                 lf->lineIx, lf->fileName);
00077         strcpy(type, word);
00078         }
00079     else if (!startsWith(type, line) || !isspace(line[typeLen]))
00080         {
00081         lineFileReuse(lf);
00082         break;
00083         }
00084     else
00085         {
00086         dyStringAppendC(val, '\n');
00087         word = nextWord(&line);
00088         }
00089 
00090     if (line != NULL)
00091         {
00092         /* Usually have two spaces after type. */
00093         if (isspace(line[0]))
00094            ++line;
00095         if (isspace(line[0]))
00096            ++line;
00097 
00098         /* Append what's rest of line to return value. */
00099         dyStringAppend(val, line);
00100         }
00101     }
00102 return typeLen > 0;
00103 }
00104 
00105 struct hash *emblRecord(struct lineFile *lf)
00106 /* Read next record and return it in hash.   (Free this
00107  * hash with freeHashAndVals.)   Hash is keyed by type
00108  * and has string values. */
00109 {
00110 struct hash *hash = NULL;
00111 char type[16];
00112 struct dyString *val = newDyString(256);
00113 boolean gotEnd = FALSE;
00114 
00115 while (emblLineGroup(lf, type, val))
00116     {
00117     if (hash == NULL)
00118         hash = newHash(7);
00119     if (sameString(type, "//"))
00120         {
00121         gotEnd = TRUE;
00122         break;
00123         }
00124     hashAdd(hash, type, cloneString(val->string));
00125     }
00126 if (hash != NULL && !gotEnd)
00127     warn("Incomplete last record of embl file %s\n", lf->fileName);
00128 return hash;
00129 }
00130 
static void notEmbl(char *fileName)
/* Report through errAbort that fileName does not look like
 * an EMBL file. */
{
errAbort("%s is not an emblFile",
         fileName);
}
00136 
00137 struct lineFile *emblOpen(char *fileName, char type[256])
00138 /* Open up embl file, verify format and optionally  return 
00139  * type (VV line).  Close this with lineFileClose(). */
00140 {
00141 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00142 struct hash *hash = emblRecord(lf);
00143 char *vv;
00144 
00145 if (hash == NULL)
00146     notEmbl(fileName);
00147 if ((vv = hashFindVal(hash, "VV")) == NULL)
00148     notEmbl(fileName);
00149 if (type != NULL)
00150     {
00151     if (strlen(vv) >= 256)
00152         notEmbl(fileName);
00153     strcpy(type, vv);
00154     }
00155 freeHashAndVals(&hash);
00156 return lf;
00157 }

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2