lib/tokenizer.c

Go to the documentation of this file.
00001 /* tokenizer - A tokenizer structure that will chop up a file into
00002  * tokens.  It is aware of quoted strings and otherwise tends to return
00003  * white-space- or punctuation-separated words, with punctuation in
00004  * a separate token.  This is used by autoSql. */
00005 
00006 #include "common.h"
00007 #include "errabort.h"
00008 #include "linefile.h"
00009 #include "tokenizer.h"
00010 
/* Version-control "$Id$" keyword stamp for this file (file name, revision, date, author). */
00011 static char const rcsid[] = "$Id: tokenizer.c,v 1.3 2004/07/14 05:47:14 kent Exp $";
00012 
00013 struct tokenizer *tokenizerOnLineFile(struct lineFile *lf)
00014 /* Create a new tokenizer on open lineFile. */
00015 {
00016 struct tokenizer *tkz;
00017 AllocVar(tkz);
00018 tkz->sAlloc = 128;
00019 tkz->string = needMem(tkz->sAlloc);
00020 tkz->lf = lf;
00021 tkz->curLine = tkz->linePt = "";
00022 return tkz;
00023 }
00024 
00025 struct tokenizer *tokenizerNew(char *fileName)
00026 /* Return a new tokenizer. */
00027 {
00028 return tokenizerOnLineFile(lineFileOpen(fileName, TRUE));
00029 }
00030 
00031 void tokenizerFree(struct tokenizer **pTkz)
00032 /* Tear down a tokenizer. */
00033 {
00034 struct tokenizer *tkz;
00035 if ((tkz = *pTkz) != NULL)
00036     {
00037     freeMem(tkz->string);
00038     lineFileClose(&tkz->lf);
00039     freez(pTkz);
00040     }
00041 }
00042 
00043 void tokenizerReuse(struct tokenizer *tkz)
00044 /* Reuse token. */
00045 {
00046 tkz->reuse = TRUE;
00047 }
00048 
00049 int tokenizerLineCount(struct tokenizer *tkz)
00050 /* Return line of current token. */
00051 {
00052 return tkz->lf->lineIx;
00053 }
00054 
00055 char *tokenizerFileName(struct tokenizer *tkz)
00056 /* Return name of file. */
00057 {
00058 return tkz->lf->fileName;
00059 }
00060 
00061 char *tokenizerNext(struct tokenizer *tkz)
00062 /* Return token's next string (also available as tkz->string) or
00063  * NULL at EOF. */
00064 {
00065 char *start, *end;
00066 char c, *s;
00067 int size;
00068 if (tkz->reuse)
00069     {
00070     tkz->reuse = FALSE;
00071     return tkz->string;
00072     }
00073 for (;;)        /* Skip over white space and comments. */
00074     {
00075     int lineSize;
00076     s = start = skipLeadingSpaces(tkz->linePt);
00077     if ((c = start[0]) != 0)
00078         {
00079         if (tkz->uncommentC && c == '/')
00080              {
00081              if (start[1] == '/')
00082                  ;  /* Keep going in loop effectively ignoring rest of line. */
00083              else if (start[1] == '*')
00084                  {
00085                  start += 2;
00086                  for (;;)
00087                      {
00088                      char *end = stringIn("*/", start);
00089                      if (end != NULL)
00090                           {
00091                           tkz->linePt = end+2;
00092                           break;
00093                           }
00094                      if (!lineFileNext(tkz->lf, &tkz->curLine, &lineSize))
00095                           errAbort("End of file (%s) in comment", tokenizerFileName(tkz));
00096                      start = tkz->curLine;
00097                      }
00098                  continue;
00099                  }
00100              else
00101                  break;
00102              }
00103         else if (tkz->uncommentShell && c == '#')
00104              ;  /* Keep going in loop effectively ignoring rest of line. */
00105         else
00106             break;      /* Got something real. */
00107         }
00108     if (!lineFileNext(tkz->lf, &tkz->curLine, &lineSize))
00109         {
00110         tkz->eof = TRUE;
00111         return NULL;
00112         }
00113     tkz->linePt = tkz->curLine;
00114     }
00115 if (isalnum(c) || (c == '_'))
00116     {
00117     for (;;)
00118         {
00119         s++;
00120         if (!(isalnum(*s) || (*s == '_')))
00121             break;
00122         }
00123     end = s;
00124     }
00125 else if (c == '"' || c == '\'')
00126     {
00127     char quot = c;
00128     if (tkz->leaveQuotes)
00129         start = s++;
00130     else
00131         start = ++s;
00132     for (;;)
00133         {
00134         c = *s;
00135         if (c == quot)
00136             {
00137             if (s[-1] == '\\')
00138                 {
00139                 if (s >= start+2 && s[-2] == '\\')
00140                     break;
00141                 }
00142             else
00143                 break;
00144             }
00145         else if (c == 0)
00146             {
00147             break;
00148             }
00149         ++s;
00150         }
00151     end = s;
00152     if (c != 0)
00153         ++s;
00154     if (tkz->leaveQuotes)
00155         end += 1;
00156     }
00157 else
00158     {
00159     end = ++s;
00160     }
00161 tkz->linePt = s;
00162 size = end - start;
00163 if (size >= tkz->sAlloc)
00164     {
00165     tkz->sAlloc = size+128;
00166     tkz->string = needMoreMem(tkz->string, 0, tkz->sAlloc);
00167     }
00168 memcpy(tkz->string, start, size);
00169 tkz->string[size] = 0;
00170 return tkz->string;
00171 }
00172 
00173 
00174 void tokenizerErrAbort(struct tokenizer *tkz, char *format, ...)
00175 /* Print error message followed by file and line number and
00176  * abort. */
00177 {
00178 va_list args;
00179 va_start(args, format);
00180 vaWarn(format, args);
00181 errAbort("line %d of %s:\n%s", 
00182         tokenizerLineCount(tkz), tokenizerFileName(tkz), tkz->curLine);
00183 }
00184 
00185 void tokenizerNotEnd(struct tokenizer *tkz)
00186 /* Squawk if at end. */
00187 {
00188 if (tkz->eof)
00189     errAbort("Unexpected end of file");
00190 }
00191 
00192 void tokenizerMustHaveNext(struct tokenizer *tkz)
00193 /* Get next token, which must be there. */
00194 {
00195 if (tokenizerNext(tkz) == NULL)
00196     errAbort("Unexpected end of file");
00197 }
00198 
00199 void tokenizerMustMatch(struct tokenizer *tkz, char *string)
00200 /* Require next token to match string.  Return next token
00201  * if it does, otherwise abort. */
00202 {
00203 if (sameWord(tkz->string, string))
00204     tokenizerMustHaveNext(tkz);
00205 else
00206     tokenizerErrAbort(tkz, "Expecting %s got %s", string, tkz->string);
00207 }
00208 

Generated on Tue Dec 25 18:39:32 2007 for blat by  doxygen 1.5.2