lib/xp.c

Go to the documentation of this file.
00001 /* xp - A minimal non-verifying xml parser.  It's
00002  * stream oriented much like expat.  It's a bit faster
00003  * and smaller than expat.  I'm not sure it handles unicode
00004  * as well.
00005  *
00006  * This file is copyright 2002-2005 Jim Kent, but license is hereby
00007  * granted for all use - public, private or commercial. */
00008 
00009 #include "common.h"
00010 #include "dystring.h"
00011 #include "errabort.h"
00012 #include "hash.h"
00013 #include "xp.h"
00014 #include "xmlEscape.h"
00015 
00016 static char const rcsid[] = "$Id: xp.c,v 1.15 2005/12/19 17:51:14 kent Exp $";
00017 
00018 
00019 char xpNextBuf(struct xp *xp)
00020 /* Fetch a new buffer and return first char.  Return 0 at EOF. */
00021 {
00022 int size = xp->read(xp->userData, xp->inBuf, sizeof(xp->inBuf));
00023 if (size <= 0)
00024     return 0;
00025 xp->inBufEnd = xp->inBuf + size;
00026 xp->in = xp->inBuf+1;
00027 return xp->inBuf[0];
00028 }
00029 
/* Macro to quickly fetch next char.  Takes it straight from the buffer
 * when one is available, otherwise refills through xpNextBuf (which
 * returns 0 at EOF).  The argument is evaluated more than once, so it
 * must be free of side effects. */
#define xpGetChar(xp) \
    ((xp)->in < (xp)->inBufEnd ? *(xp)->in++ : xpNextBuf(xp))

/* Oops, don't fetch that after all.  Backs the cursor up one character;
 * only meaningful directly after a successful xpGetChar, while that
 * character is still in the buffer. */
#define xpUngetChar(xp) \
    (--(xp)->in)
00037 
00038 struct xp *xpNew(void *userData, 
00039    void (*atStartTag)(void *userData, char *name, char **atts),
00040    void (*atEndTag)(void *userData, char *name, char *text),
00041    int (*read)(void *userData, char *buf, int bufSize),
00042    char *fileName)
00043 /* Form a new xp parser.  File name may be NULL - just used for
00044  * error reporting. */
00045 {
00046 struct xp *xp;
00047 AllocVar(xp);
00048 xp->stack = xp->stackBufEnd = xp->stackBuf + ArraySize(xp->stackBuf);
00049 xp->userData = userData;
00050 xp->atStartTag = atStartTag;
00051 xp->atEndTag = atEndTag;
00052 xp->read = read;
00053 xp->lineIx = 1;
00054 xp->endTag = newDyString(64);
00055 if (fileName)
00056     xp->fileName = cloneString(fileName);
00057 else
00058     xp->fileName = cloneString("XML");
00059 xp->inBufEnd = xp->in = xp->inBuf;              
00060 xp->symHash = xmlEscapeSymHash();
00061 return xp;
00062 }
00063 
int xpReadFromFile(void *userData, char *buf, int bufSize)
/* Read callback for use with xpNew when userData is a FILE pointer.
 * Returns the number of bytes fread stored in buf. */
{
FILE *stream = userData;
size_t got = fread(buf, 1, bufSize, stream);
return got;
}
00070 
00071 
00072 
00073 void xpFree(struct xp **pXp)
00074 /* Free up an xp parser. */
00075 {
00076 int i;
00077 struct xp *xp = *pXp;
00078 if (xp != NULL)
00079     {
00080     struct xpStack *stack;
00081     for (stack = xp->stackBufEnd; --stack >= xp->stackBuf; )
00082         {
00083         if (stack->tag == NULL)
00084             break;
00085         freeDyString(&stack->tag);
00086         freeDyString(&stack->text);
00087         }
00088     for (i=0; i<ArraySize(xp->attDyBuf); ++i)
00089         {
00090         if (xp->attDyBuf[i] == NULL)
00091             break;
00092         freeDyString(&xp->attDyBuf[i]);
00093         }
00094     freeDyString(&xp->endTag);
00095     freeMem(xp->fileName);
00096     hashFree(&xp->symHash);
00097     freez(pXp);
00098     }
00099 }
00100 
00101 int xpLineIx(struct xp *xp)
00102 /* Return current line number. */
00103 {
00104 return xp->lineIx;
00105 }
00106 
00107 char *xpFileName(struct xp *xp)
00108 /* Return current file name. */
00109 {
00110 return xp->fileName;
00111 }
00112 
void xpError(struct xp *xp, char *format, ...)
/* Output an error message with filename and line number included.
 * The printf-style message is emitted through vaWarn, then errAbort
 * is called with the "line N of FILE" location. */
{
va_list args;
va_start(args, format);
vaWarn(format, args);
/* Close the argument list before aborting: errAbort is not expected to
 * return normally, so a va_end placed after it would never run. */
va_end(args);
errAbort("line %d of %s", xpLineIx(xp), xpFileName(xp));
}
00122 
static void xpUnexpectedEof(struct xp *xp)
/* Report premature end of input through xpError, which adds the
 * line number and file name to the message. */
{
xpError(xp, "Unexpected end of file.");
}
00128 
00129 static void xpEatComment(struct xp *xp, char commentC)
00130 /* Skip characters until comment end. */
00131 {
00132 int startLine = xp->lineIx;
00133 char lastC = 0;
00134 char c;
00135 for (;;)
00136     {
00137     if ((c = xpGetChar(xp)) == 0)
00138         xpError(xp, "End of file in comment that started line %d", startLine);
00139     if (c == '\n')
00140         ++xp->lineIx;
00141     if (c == '>')
00142         {
00143         if (lastC == commentC || commentC == '!')
00144         break;
00145         }
00146     lastC = c;
00147     }
00148 }
00149 
00150 static void xpLookup(struct xp *xp, struct dyString *temp, struct dyString *text)
00151 /* Parse after '&' until ';' and look up symbol.  Put result into text. */
00152 {
00153 char c;
00154 char *s;
00155 dyStringClear(temp);
00156 for (;;)
00157     {
00158     if ((c = xpGetChar(xp)) == 0)
00159         xpError(xp, "End of file in after & and before ;");
00160     if (isspace(c))
00161         xpError(xp, "& without ;");
00162     if (c == ';')
00163         break;
00164     dyStringAppendC(temp, c);
00165     }
00166 s = temp->string;
00167 if (s[0] == '#')
00168     {
00169     c = atoi(s+1);
00170     dyStringAppendC(text, c);
00171     }
00172 else if ((s = hashFindVal(xp->symHash, s)) == NULL)
00173     {
00174     dyStringAppendC(text, '&');
00175     dyStringAppend(text, temp->string);
00176     dyStringAppendC(text, ';');
00177     }
00178 else
00179     {
00180     dyStringAppend(text, s);
00181     }
00182 }
00183 
void xpForceMatch(struct xp *xp, char *matchString)
/* Consume input that must spell out matchString exactly.  Reports
 * "Expecting ..." through xpError at the first character that differs. */
{
char *expected;
for (expected = matchString; *expected != 0; ++expected)
    {
    char got = xpGetChar(xp);
    if (got != *expected)
        xpError(xp, "Expecting %s", matchString);
    }
}
00194 
00195 void xpTextUntil(struct xp *xp, char *endPattern)
00196 /* Stuff xp->text with everything up to endPattern. */
00197 {
00198 int endSize = strlen(endPattern);
00199 int endPos = 0;
00200 char c;
00201 struct dyString *dy = xp->stack->text;
00202 for (;;)
00203     {
00204     if ((c = xpGetChar(xp)) == 0)
00205         xpUnexpectedEof(xp);
00206     if (c == endPattern[endPos])
00207         {
00208         endPos += 1;
00209         if (endPos == endSize)
00210             return;
00211         }
00212     else
00213         {
00214         if (endPos > 0)
00215             dyStringAppendN(dy, endPattern, endPos);
00216         dyStringAppendC(dy, c);
00217         endPos = 0;
00218         }
00219     }
00220 }
00221 
00222 
00223 void xpParseStartTag(struct xp *xp, 
00224         int maxAttCount,                  /* Maximum attribute count. */
00225         struct dyString *retName,         /* Returns tag name */
00226         int *retAttCount,                 /* Returns attribute count. */
00227         struct dyString **retAttributes,  /* Name, value, name, value... */
00228         boolean *retClosed)       /* If true then is self-closing (ends in />) */
00229 /* Call this after the first '<' in a tag has been read.  It'll
00230  * parse out until the '>' tag. */
00231 {
00232 char c, quotC;
00233 int attCount = 0;
00234 struct dyString *dy;
00235 int lineStart;
00236 
00237 dyStringClear(retName);
00238 
00239 /* Skip white space after '<' and before tag name. */
00240 for (;;)
00241     {
00242     if ((c = xpGetChar(xp)) == 0)
00243         xpUnexpectedEof(xp);
00244     if (isspace(c))
00245         {
00246         if (c == '\n')
00247             ++xp->lineIx;
00248         }
00249     else
00250         break;
00251     }
00252 
00253 /* Read in tag name. */
00254 for (;;)
00255     {
00256     dyStringAppendC(retName, c);
00257     if ((c = xpGetChar(xp)) == 0)
00258         xpUnexpectedEof(xp);
00259     if (c == '>' || c == '/' || isspace(c))
00260         break;
00261     }
00262 if (c == '\n')
00263     ++xp->lineIx;
00264 
00265 /* Parse attributes. */
00266 if (c != '>' && c != '/')
00267     {
00268     for (;;)
00269         {
00270         /* Skip leading white space. */
00271         for (;;)
00272             {
00273             if ((c = xpGetChar(xp)) == 0)
00274                 xpUnexpectedEof(xp);
00275             if (isspace(c))
00276                 {
00277                 if (c == '\n')
00278                     ++xp->lineIx;
00279                 }
00280             else
00281                 break;
00282             }
00283         if (c == '>' || c == '/')
00284             break;
00285 
00286         /* Allocate space in attribute table. */
00287         if (attCount >= maxAttCount - 2)
00288             xpError(xp, "Attribute stack overflow");
00289         dy = retAttributes[attCount];
00290         if (dy == NULL)
00291             dy = retAttributes[attCount] = newDyString(64);
00292         else
00293             dyStringClear(dy);
00294         ++attCount;
00295 
00296         /* Read until not a label character. */
00297         for (;;)
00298             {
00299             dyStringAppendC(dy, c);
00300             if ((c = xpGetChar(xp)) == 0)
00301                 xpUnexpectedEof(xp);
00302             if (isspace(c))
00303                 {
00304                 if (c == '\n')
00305                     ++xp->lineIx;
00306                 break;
00307                 }
00308             if (c == '=')
00309                 break;
00310             if (c == '/' || c == '>')
00311                 xpError(xp, "Expecting '=' after attribute name");
00312             }
00313 
00314         /* Skip white space until '=' */
00315         if (c != '=')
00316             {
00317             for (;;)
00318                 {
00319                 if ((c = xpGetChar(xp)) == 0)
00320                     xpUnexpectedEof(xp);
00321                 if (isspace(c))
00322                     {
00323                     if (c == '\n')
00324                         ++xp->lineIx;
00325                     }
00326                 else
00327                     break;
00328                 }
00329             if (c != '=')
00330                 xpError(xp, "Expecting '=' after attribute name");
00331             }
00332 
00333         /* Skip space until quote. */
00334         for (;;)
00335             {
00336             if ((c = xpGetChar(xp)) == 0)
00337                 xpUnexpectedEof(xp);
00338             else if (isspace(c))
00339                 {
00340                 if (c == '\n')
00341                     ++xp->lineIx;
00342                 }
00343             else
00344                 break;
00345             }
00346         if (c != '\'' && c != '"')
00347             xpError(xp, "Expecting quoted string after =");
00348 
00349         /* Allocate space in attribute table. */
00350         if (attCount >= maxAttCount - 2)
00351             xpError(xp, "Attribute stack overflow");
00352         dy = retAttributes[attCount];
00353         if (dy == NULL)
00354             dy = retAttributes[attCount] = newDyString(64);
00355         else
00356             dyStringClear(dy);
00357         ++attCount;
00358 
00359         /* Read until next quote. */
00360         quotC = c;
00361         lineStart = xp->lineIx;
00362         for (;;)
00363             {
00364             if ((c = xpGetChar(xp)) == 0)
00365                xpError(xp, "End of file inside literal string that started at line %d", lineStart);
00366             if (c == quotC)
00367                 break;
00368             if (c == '&')
00369                xpLookup(xp, xp->endTag, dy);
00370             else
00371                 {
00372                 if (c == '\n')
00373                     ++xp->lineIx;
00374                 dyStringAppendC(dy, c);
00375                 }
00376             }
00377         }
00378     }
00379 if (c == '/')
00380     {
00381     *retClosed = TRUE;
00382     c = xpGetChar(xp);
00383     if (c != '>')
00384         xpError(xp, "Expecting '>' after '/'");
00385     }
00386 else
00387     *retClosed = FALSE;
00388 *retAttCount = attCount;
00389 }
00390 
00391 void xpParseEndTag(struct xp *xp, char *tagName)
00392 /* Call this after have seen </.  It will parse through
00393  * > and make sure that the tagName matches. */
00394 {
00395 struct dyString *dy = xp->endTag;
00396 char c;
00397 
00398 dyStringClear(dy);
00399 
00400 /* Skip leading space. */
00401 for (;;)
00402     {
00403     if ((c = xpGetChar(xp)) == 0)
00404         xpUnexpectedEof(xp);
00405     if (isspace(c))
00406         {
00407         if (c == '\n')
00408             ++xp->lineIx;
00409         }
00410     else
00411         break;
00412     }
00413 
00414 /* Read end tag. */
00415 for (;;)
00416     {
00417     dyStringAppendC(dy, c);
00418     if ((c = xpGetChar(xp)) == 0)
00419         xpUnexpectedEof(xp);
00420     if (isspace(c))
00421         {
00422         if (c == '\n')
00423             ++xp->lineIx;
00424         break;
00425         }
00426     if (c == '>')
00427         break;
00428     }
00429 
00430 /* Skip until '>' */
00431 while (c != '>')
00432     {
00433     dyStringAppendC(dy, c);
00434     if ((c = xpGetChar(xp)) == 0)
00435         xpUnexpectedEof(xp);
00436     if (isspace(c))
00437         {
00438         if (c == '\n')
00439             ++xp->lineIx;
00440         }
00441     else if (c != '>')
00442         xpError(xp, "Unexpected characters past first word in /%s tag", dy->string);
00443     }
00444 
00445 if (!sameString(dy->string, tagName))
00446     xpError(xp, "Mismatch between start tag %s and end tag %s",  tagName, dy->string);
00447 }
00448 
00449 boolean xpParseNext(struct xp *xp, char *tag)
00450 /* Skip through file until get given tag.  Then parse out the
00451  * tag and all of it's children (calling atStartTag/atEndTag).
00452  * You can call this repeatedly to process all of a given tag
00453  * in file. */
00454 
00455 {
00456 char c;
00457 int i, attCount = 0;
00458 struct dyString *text = NULL;
00459 boolean isClosed;
00460 boolean inside = (tag == NULL);
00461 struct xpStack *initialStack = xp->stack;
00462 
00463 for (;;)
00464     {
00465     /* Load up text until next tag. */
00466     for (;;)
00467         {
00468         if ((c = xpGetChar(xp)) == 0)
00469             return FALSE;
00470         if (c == '<')
00471             break;
00472         if (c == '&')
00473            xpLookup(xp, xp->endTag, text);
00474         else 
00475             {
00476             if (c == '\n')
00477                 ++xp->lineIx;
00478             if (text != NULL)
00479                 dyStringAppendC(text, c);
00480             }
00481         }
00482 
00483     /* Get next character to figure out what type of tag. */
00484     c = xpGetChar(xp);
00485     if (c == 0)
00486        xpError(xp, "End of file inside tag");
00487     else if (c == '?' || c == '!')
00488         xpEatComment(xp, c);
00489     else if (c == '/')  /* Closing tag. */
00490         {
00491         struct xpStack *stack = xp->stack;
00492         if (stack >= xp->stackBufEnd)
00493             xpError(xp, "Extra end tag");
00494         xpParseEndTag(xp, stack->tag->string);
00495         if (inside)
00496             xp->atEndTag(xp->userData, stack->tag->string, stack->text->string);
00497         xp->stack += 1;
00498         if (xp->stack == initialStack)
00499             return TRUE;
00500         }
00501     else        /* Start tag. */
00502         {
00503         /* Push new frame on stack and check for overflow and unallocated strings. */
00504         struct xpStack *stack = --xp->stack;
00505         if (stack < xp->stackBuf)
00506             xpError(xp, "Stack overflow");
00507         if (stack->tag == NULL)
00508             stack->tag = newDyString(32);
00509         else
00510             dyStringClear(stack->tag);
00511         if (stack->text == NULL)
00512             stack->text = newDyString(256);
00513         else
00514             dyStringClear(stack->text);
00515         text = stack->text;
00516 
00517         /* Parse the start tag. */
00518         xpUngetChar(xp);
00519         xpParseStartTag(xp, ArraySize(xp->attDyBuf), stack->tag, 
00520                 &attCount, xp->attDyBuf, &isClosed);
00521 
00522         if (!inside && sameString(stack->tag->string, tag))
00523             {
00524             inside = TRUE;
00525             initialStack = xp->stack + 1;
00526             }
00527 
00528         /* Call user start function, and if closed tag, end function too. */
00529         if (inside)
00530             {
00531             /* Unpack attributes into simple array of strings. */
00532             for (i=0; i<attCount; ++i)
00533                 xp->attBuf[i] = xp->attDyBuf[i]->string;
00534             xp->attBuf[attCount] = NULL;
00535             xp->atStartTag(xp->userData, stack->tag->string, xp->attBuf);
00536             }
00537         if (isClosed)
00538             {
00539             if (inside)
00540                 xp->atEndTag(xp->userData, stack->tag->string, stack->text->string);
00541             xp->stack += 1;
00542             if (xp->stack == initialStack)
00543                 return TRUE;
00544             }
00545         }
00546     }
00547 }
00548 
void xpParse(struct xp *xp)
/* Parse one complete element, start tag through matching end tag,
 * with callbacks active throughout.  Throw error if a problem. */
{
char *anyTag = NULL;    /* NULL asks xpParseNext not to search for a tag. */
xpParseNext(xp, anyTag);
}
00554 

Generated on Tue Dec 25 18:39:32 2007 for blat by  doxygen 1.5.2