lib/dtdParse.c

Go to the documentation of this file.
00001 /* dtdParse - parse an XML DTD file.  Actually this only
00002  * parses a relatively simple subset of DTD's.  It's still
00003  * useful for autoXml and xmlToSql. */
00004 
00005 #include "common.h"
00006 #include "hash.h"
00007 #include "linefile.h"
00008 #include "dystring.h"
00009 #include "obscure.h"
00010 #include "dtdParse.h"
00011 
00012 
00013 static void syntaxError(struct lineFile *lf)
00014 /* Report syntax error and exit. */
00015 {
00016 errAbort("Syntax error line %d of %s", lf->lineIx, lf->fileName);
00017 }
00018 
00019 static char *needNextWord(char **pLine, struct lineFile *lf)
00020 /* Get next word in line.  Squawk and die if can't find it. */
00021 {
00022 char *word = nextWord(pLine);
00023 if (word == NULL)
00024     errAbort("Missing data line %d of %s", lf->lineIx, lf->fileName);
00025 return word;
00026 }
00027 
00028 void needQuotedString( char *in, char *out, struct lineFile *lf, char **retNext)
00029 /* Grab quoted string starting at in and put it into out.  Advance retNext
00030  * to just past quoted string.  In and out may point to same buffer. */
00031 {
00032 if (!parseQuotedString(in, out, retNext))
00033     errAbort("Missing closing quote line %d of %s", lf->lineIx, lf->fileName);
00034 }
00035 
00036 static boolean isAllUpper(char *s)
00037 /* Return true if all alphabetical letters in string
00038  * are upper case. */
00039 {
00040 char c;
00041 while ((c = *s++) != 0)
00042     {
00043     if (isalpha(c) && !isupper(c))
00044         return FALSE;
00045     }
00046 return TRUE;
00047 }
00048 
00049 static boolean isAllLower(char *s)
00050 /* Return true if all alphabetical letters in string
00051  * are upper case. */
00052 {
00053 char c;
00054 while ((c = *s++) != 0)
00055     {
00056     if (isalpha(c) && !islower(c))
00057         return FALSE;
00058     }
00059 return TRUE;
00060 }
00061 
00062 
00063 static char *mixedCaseName(char *prefix, char *orig)
00064 /* Convert var_like_this or VAR_LIKE_THIS or even
00065  * var-like-this to varLikeThis. */
00066 {
00067 char *mixed;
00068 char *d, *s = orig;
00069 char c;
00070 int prefixLen = strlen(prefix), len;
00071 boolean nextUpper;
00072 boolean allUpper = isAllUpper(orig); 
00073 boolean allLower = isAllLower(orig);
00074 boolean initiallyMixed = (!allUpper && !allLower);
00075 
00076 /* Allocate string big enough for prefix and all. */
00077 len = strlen(orig) + prefixLen;
00078 mixed = d = needMem(len+1);
00079 strcpy(d, prefix);
00080 d += prefixLen;
00081 nextUpper = (prefixLen > 0);
00082 
00083 for (;;)
00084    {
00085    c = *s++;
00086    if (c == '_' || c == '-' || c == ':')
00087        nextUpper = TRUE;
00088    else
00089        {
00090        if (nextUpper)
00091            c = toupper(c);
00092        else if (!initiallyMixed)
00093            c = tolower(c);
00094        nextUpper = FALSE;
00095        *d++ = c;
00096        if (c == 0)
00097            break;
00098        }
00099    }
00100 return mixed;
00101 }
00102 
static struct hash *initialEntityHash()
/* Build a fresh entity hash holding only the built-in entities.
 * Each value is a cloneString() copy of the mapped text. */
{
/* Built-in entity name and the text it maps to, in insertion order. */
static char *builtIns[][2] =
    {
    {"INTEGER", "#INT"},
    {"REAL", "#FLOAT"},
    {"INT", "INT"},
    {"FLOAT", "FLOAT"},
    };
struct hash *entities = hashNew(0);
int i, count = sizeof(builtIns) / sizeof(builtIns[0]);
for (i = 0; i < count; ++i)
    hashAdd(entities, builtIns[i][0], cloneString(builtIns[i][1]));
return entities;
}
00114 
00115 static struct dtdElement *parseElement(
00116         char *prefix, char *textField, char *line, 
00117         struct hash *elHash, struct lineFile *lf)
00118 /* Parse out <!ELEMENT line after <!ELEMENT. */
00119 {
00120 char *word, *s, *e;
00121 char *words[256];
00122 int wordCount, i;
00123 struct dtdElChild *ec;
00124 struct dtdElement *el;
00125 boolean isOr;
00126 char orCopyCode = '?';
00127 
00128 word = needNextWord(&line, lf);
00129 s = word + strlen(word)-1;
00130 if (s[0] == '>')
00131    *s = 0;
00132 if ((el = hashFindVal(elHash, word)) != NULL)
00133     errAbort("Duplicate element %s line %d and %d of %s", word, el->lineIx, lf->lineIx, lf->fileName);
00134 AllocVar(el);
00135 el->lineIx = lf->lineIx;
00136 hashAddSaveName(elHash, word, el, &el->name);
00137 el->mixedCaseName = mixedCaseName(prefix, el->name);
00138 if (line != NULL && (s = strchr(line, '(')) != NULL)
00139     {
00140     s += 1;
00141     if ((e = strchr(line, ')')) == NULL)
00142         errAbort("Missing ')' line %d of %s", lf->lineIx, lf->fileName);
00143     *e = 0;
00144     isOr = (strchr(s, '|') != NULL);
00145     if (isOr)
00146       {
00147         orCopyCode = *(e+1);
00148         if ((orCopyCode != '+') && (orCopyCode != '*'))
00149           orCopyCode = '?';
00150       }
00151     wordCount = chopString(s, "| ,\t", words, ArraySize(words));
00152     if (wordCount == ArraySize(words))
00153         errAbort("Too many children in list line %d of %s", lf->lineIx, lf->fileName);
00154     for (i=0; i<wordCount; ++i)
00155         {
00156         char *name = words[i];
00157         int len = strlen(name);
00158         char lastC = name[len-1];
00159         if (name[0] == '#')
00160             {
00161             if (isOr)
00162                 errAbort("# character in enumeration not allowed line %d of %s",
00163                    lf->lineIx, lf->fileName);
00164             if (el->textType != NULL)
00165                 errAbort("Multiple types for text between tags line %d of %s", 
00166                         lf->lineIx, lf->fileName);
00167             el->textType = cloneString(name);
00168             }
00169         else
00170             {
00171             AllocVar(ec);
00172             slAddHead(&el->children, ec);
00173             ec->isOr = isOr;
00174             if (isOr)
00175                ec->copyCode = orCopyCode;
00176             else
00177                 {
00178                 if (lastC == '+' || lastC == '?' || lastC == '*')
00179                     {
00180                     ec->copyCode = lastC;
00181                     name[len-1] = 0;
00182                     }
00183                 else
00184                     ec->copyCode = '1';
00185                 }
00186             if (sameString(name, textField))
00187                 errAbort("Name conflict with default text field name line %d of %s", lf->lineIx, lf->fileName);
00188             ec->name = cloneString(name);
00189             }
00190         }
00191     slReverse(&el->children);
00192     }
00193 return el;
00194 }
00195 
00196 static void parseAttribute(char *line, char *textField,
00197         struct hash *elHash, struct lineFile *lf)
00198 /* Parse out <!ATTLIST line after <!ATTLIST. */
00199 {
00200 char *word;
00201 struct dtdAttribute *att;
00202 struct dtdElement *el;
00203 char *e;
00204 
00205 /* Get rid of trailing '>' */
00206 e = strrchr(line, '>');
00207 if (e == NULL)
00208     errAbort("Missing '>' line %d of %s", lf->lineIx, lf->fileName);
00209 *e = 0;
00210 
00211 word = needNextWord(&line, lf);
00212 if ((el = hashFindVal(elHash, word)) == NULL)
00213     errAbort("Undefined %s line %d of %s", word, lf->lineIx, lf->fileName);
00214 word = needNextWord(&line, lf);
00215 if (sameString(word, textField))
00216     errAbort("Name conflict with text field name line %d of %s", lf->lineIx, lf->fileName);
00217 AllocVar(att);
00218 att->name = cloneString(word);
00219 att->mixedCaseName = mixedCaseName("", att->name);
00220 word = needNextWord(&line, lf);
00221 att->type = cloneString(word);
00222 line = skipLeadingSpaces(line);
00223 if (line[0] == '#')
00224     {
00225     word = needNextWord(&line, lf);
00226     if (sameWord("#REQUIRED", word))
00227         att->required = TRUE;
00228     else if (sameWord("#IMPLIED", word))
00229         att->usual = NULL;
00230     else
00231         errAbort("Unknown directive %s line %d of %s", word, lf->lineIx, lf->fileName);
00232     }
00233 else if (line[0] == '\'' || line[0] == '"')
00234     {
00235     word = line;
00236     needQuotedString(word, word, lf, &line);
00237     att->usual = cloneString(word);
00238     }
00239 else
00240     {
00241     word = needNextWord(&line, lf);
00242     att->usual = cloneString(word);
00243     }
00244 slAddTail(&el->attributes, att);
00245 }
00246 
00247 
00248 void parseEntity(struct hash *entityHash, struct hash *predefEntityHash,
00249         char *line, struct lineFile *lf)
00250 /* Parse out an entity and add it to hash.  We'll dodge our predefined entities. */
00251 {
00252 char *percent = needNextWord(&line, lf);
00253 char *name = needNextWord(&line, lf);
00254 char *value = skipLeadingSpaces(line);
00255 if (value[0] != '"')
00256     errAbort("Expecting quoted string at end of ENTITY tag line %d of %s",
00257         lf->lineIx, lf->fileName);
00258 needQuotedString(value, value, lf, &line);
00259 if (!sameString(percent, "%"))
00260     errAbort("Expecting %% after ENTITY tag line %d of %s", lf->lineIx, lf->fileName);
00261 if (hashLookup(predefEntityHash, name) == NULL)
00262 /* We don't want to overwrite the predefined entities.  These are all
00263  * defined to be #PCDATA or CDATA for the benefit of non-UCSC XML tools.
00264  * Internally we map them to #INT/#FLOAT etc. so we can have numbers
00265  * as well as strings in our C structures and relational database tables. */
00266     {
00267     char *oldVal = hashFindVal(entityHash, name);
00268     if (oldVal != NULL)
00269         {
00270         if (!sameString(oldVal, value))
00271             errAbort("Entity %s redefined line %d of %s", name, lf->lineIx, lf->fileName);
00272         }
00273     else
00274         {
00275         hashAdd(entityHash, name, cloneString(value));
00276         }
00277     }
00278 }
00279 
00280 
00281 static void fixupChildRefs(struct dtdElement *elList, struct hash *elHash, char *fileName)
00282 /* Go through all of elements children and make sure that the corresponding
00283  * elements are defined. */
00284 {
00285 struct dtdElement *el, *child;
00286 struct dtdElChild *ec;
00287 for (el = elList; el != NULL; el = el->next)
00288     {
00289     for (ec = el->children; ec != NULL; ec = ec->next)
00290         {
00291         if ((child = hashFindVal(elHash, ec->name)) == NULL)
00292             errAbort("%s's child %s undefined line %d of %s", el->name, ec->name, el->lineIx, fileName);
00293         ec->el = child;
00294         }
00295     }
00296 }
00297 
static char *eatComment(struct lineFile *lf, char *line)
/* Consume a comment that may run over several lines.  Starting with
 * line, keep reading lines from lf until one contains "-->".  Returns
 * the text following "-->" with leading spaces skipped, or NULL if
 * nothing follows it or the file ends before the comment closes. */
{
char *terminator;
while ((terminator = stringIn("-->", line)) == NULL)
    {
    if (!lineFileNext(lf, &line, NULL))
        return NULL;
    }
line = skipLeadingSpaces(terminator + 3);
return (line[0] == 0 ? NULL : line);
}
00315 
00316 static void expandEntities(char *s, struct hash *entityHash, struct lineFile *lf,
00317         struct dyString *dest)
00318 /* Copy s into dest, expanding any entity (something in format %name;) 
00319  * by looking it up in entity hash. */
00320 {
00321 char c;
00322 while ((c = *s++) != 0)
00323     {
00324     if (c == '%' && !isspace(s[0]))
00325         {
00326         char *name = s;
00327         char *end = strchr(s, ';');
00328         char *value;
00329         if (end == NULL)
00330             errAbort("Can't find ; after %% to close entity line %d of %s",
00331                 lf->lineIx, lf->fileName);
00332         *end++ = 0;
00333         s = end;
00334         value = hashFindVal(entityHash, name);
00335         if (value == NULL)
00336             errAbort("Entity %%%s; is not defined line %d of %s",
00337                 name, lf->lineIx, lf->fileName);
00338         dyStringAppend(dest, value);
00339         }
00340     else
00341         dyStringAppendC(dest, c);
00342     }
00343 }
00344 
00345 static char *dtdxTag(struct lineFile *lf, struct hash *entityHash,
00346         struct dyString *buf)
00347 /* Return next tag. */
00348 {
00349 char *line;
00350 
00351 /* Skip until get a line that starts with '<' */
00352 if (!lineFileNextReal(lf,  &line))
00353     return NULL;
00354 line = trimSpaces(line);
00355 if (line[0] != '<')
00356     errAbort("Text outside of a tag line %d of %s", lf->lineIx, lf->fileName);
00357 dyStringClear(buf);
00358 for (;;)
00359     {
00360     expandEntities(line, entityHash, lf, buf);
00361     if (buf->string[buf->stringSize-1] == '>')
00362          break;
00363     dyStringAppendC(buf, ' ');
00364     if (!lineFileNext(lf, &line, NULL))
00365         errAbort("End of file %s inside of a tag.", lf->fileName);
00366     line = trimSpaces(line);
00367     }
00368 return buf->string;
00369 }
00370 
00371 void dtdParse(char *fileName, char *prefix, char *textField,
00372         struct dtdElement **retList, struct hash **retHash)
00373 /* Parse out a dtd file into elements that are returned in retList,
00374  * and for your convenience also in retHash (which is keyed by the
00375  * name of the element.  Note that XML element names can include the '-'
00376  * character.  For this and other reasons in addition to the element
00377  * name as it appears in the XML tag, the element has a mixedCaseName
00378  * that strips '-' and '_' chars, and tries to convert the name to
00379  * a mixed-case convention style name.  The prefix if any will be
00380  * prepended to mixed-case names.  The textField is what to name
00381  * the field that contains the letters between tags.  By default
00382  * (if NULL) it is "text." */
00383 {
00384 struct hash *elHash = newHash(8);
00385 struct hash *entityHash = initialEntityHash();
00386 struct hash *predefEntityHash = initialEntityHash();
00387 struct dtdElement *elList = NULL, *el;
00388 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00389 char *line, *word;
00390 struct dyString *buf = dyStringNew(0);
00391 
00392 if (prefix == NULL)
00393     prefix = "";
00394 if (textField == NULL)
00395     textField = "text";
00396 while ((line = dtdxTag(lf, entityHash, buf)) != NULL)
00397     {
00398     line = trimSpaces(line);
00399     if (line == NULL || line[0] == 0 || line[0] == '#')
00400         continue;
00401     if (startsWith("<!--", line))
00402         {
00403         line = eatComment(lf, line);
00404         if (line == NULL)
00405             continue;
00406         }
00407     if (!startsWith("<!", line))
00408         syntaxError(lf);
00409     line += 2;
00410     word = needNextWord(&line, lf);
00411     if (sameWord("ELEMENT", word))
00412         {
00413         el = parseElement(prefix, textField, line, elHash, lf);
00414         slAddHead(&elList, el);
00415         }
00416     else if (sameWord("ATTLIST", word))
00417         {
00418         parseAttribute(line, textField, elHash, lf);
00419         }
00420     else if (sameWord("ENTITY", word))
00421         {
00422         parseEntity(entityHash, predefEntityHash, line, lf);
00423         }
00424     else
00425         {
00426         errAbort("Don't understand %s line %d of %s", word, lf->lineIx, lf->fileName);
00427         }
00428     }
00429 lineFileClose(&lf);
00430 dyStringFree(&buf);
00431 slReverse(&elList);
00432 fixupChildRefs(elList, elHash, fileName);
00433 freeHashAndVals(&entityHash);
00434 freeHashAndVals(&predefEntityHash);
00435 *retHash = elHash;
00436 *retList = elList;
00437 }
00438 
00439 void dtdElementDump(struct dtdElement *el, FILE *f)
00440 /* Dump info on element. */
00441 {
00442 struct dtdElChild *ec;
00443 struct dtdAttribute *att;
00444 fprintf(f, "%s %s (", el->name, el->mixedCaseName);
00445 for (ec = el->children; ec != NULL; ec = ec->next)
00446     {
00447     fprintf(f, "%s", ec->name);
00448     if (ec->copyCode != '1')
00449         fprintf(f, "%c", ec->copyCode);
00450     if (ec->isOr)
00451         fprintf(f, " (isOr)");
00452     if (ec->next != NULL)
00453         fprintf(f, ", ");
00454     }
00455 fprintf(f, ")");
00456 if (el->textType != NULL)
00457     fprintf(f, " (%s)", el->textType);
00458 fprintf(f, "\n");
00459 for (att = el->attributes; att != NULL; att = att->next)
00460     {
00461     fprintf(f, "  %s %s %s %s\n",
00462         att->name, att->type, (att->usual ? att->usual : "n/a"),  
00463         (att->required ? "required" : "optional"));
00464     }
00465 }
00466 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2