lib/htmlPage.c

Go to the documentation of this file.
00001 /* htmlPage - stuff to read, parse, and submit  htmlPages and forms. 
00002  *
00003  * typical usage is:
00004  *   struct htmlPage *page = htmlPageGet(url);
00005  *   htmlPageValidateOrAbort(page);
00006  *   var = htmlPageGetVar(page, page->forms, "org");
00007  *   if (var != NULL)
00008  *      printf("Organism = %s\n", var);
00009  *   htmlPageSetVar(page, page->forms, "org", "Human");
00010  *   newPage = htmlPageFromForm(page, page->forms, "submit", "Go");
00011  */
00012 
00013 #include "common.h"
00014 #include "errabort.h"
00015 #include "errCatch.h"
00016 #include "memalloc.h"
00017 #include "linefile.h"
00018 #include "hash.h"
00019 #include "dystring.h"
00020 #include "cheapcgi.h"
00021 #include "obscure.h"
00022 #include "filePath.h"
00023 #include "net.h"
00024 #include "htmlPage.h"
00025 
00026 static char const rcsid[] = "$Id: htmlPage.c,v 1.32 2006/07/29 00:17:28 galt Exp $";
00027 
00028 void htmlStatusFree(struct htmlStatus **pStatus)
00029 /* Free up resources associated with status */
00030 {
00031 struct htmlStatus *status = *pStatus;
00032 if (status != NULL)
00033     {
00034     freeMem(status->version);
00035     freez(pStatus);
00036     }
00037 }
00038 
00039 void htmlStatusFreeList(struct htmlStatus **pList)
00040 /* Free a list of dynamically allocated htmlStatus's */
00041 {
00042 struct htmlStatus *el, *next;
00043 
00044 for (el = *pList; el != NULL; el = next)
00045     {
00046     next = el->next;
00047     htmlStatusFree(&el);
00048     }
00049 *pList = NULL;
00050 }
00051 
00052 void htmlCookieFree(struct htmlCookie **pCookie)
00053 /* Free memory associated with cookie. */
00054 {
00055 struct htmlCookie *cookie = *pCookie;
00056 if (cookie != NULL)
00057     {
00058     freeMem(cookie->name);
00059     freeMem(cookie->value);
00060     freeMem(cookie->domain);
00061     freeMem(cookie->path);
00062     freeMem(cookie->expires);
00063     freez(pCookie);
00064     }
00065 }
00066 
00067 void htmlCookieFreeList(struct htmlCookie **pList)
00068 /* Free a list of dynamically allocated htmlCookie's */
00069 {
00070 struct htmlCookie *el, *next;
00071 
00072 for (el = *pList; el != NULL; el = next)
00073     {
00074     next = el->next;
00075     htmlCookieFree(&el);
00076     }
00077 *pList = NULL;
00078 }
00079 
00080 struct htmlCookie *htmlCookieFileRead(char *fileName)
00081 /* Read cookies from a line oriented file.  First word in line
00082  * is the cookie name, the rest of the line the cookie value. */
00083 {
00084 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00085 struct htmlCookie *list = NULL, *cookie;
00086 char *line, *word;
00087 while (lineFileNextReal(lf, &line))
00088     {
00089     word = nextWord(&line);
00090     line = skipLeadingSpaces(line);
00091     if (line == NULL)
00092         errAbort("Missing cookie value line %d of %s", lf->lineIx, lf->fileName);
00093     AllocVar(cookie);
00094     cookie->name = cloneString(word);
00095     cookie->value = cloneString(line);
00096     slAddHead(&list, cookie);
00097     }
00098 lineFileClose(&lf);
00099 slReverse(&list);
00100 return list;
00101 }
00102 
00103 static void cookieOutput(struct dyString *dy, struct htmlCookie *cookieList)
00104 /* Write cookies to dy. */
00105 {
00106 struct htmlCookie *cookie;
00107 if (cookieList != NULL)
00108     {
00109     dyStringAppend(dy, "Cookie:");
00110     for (cookie = cookieList; cookie != NULL; cookie = cookie->next)
00111         {
00112         if (cookie != cookieList)
00113             dyStringAppendC(dy, ';');
00114         dyStringAppendC(dy, ' ');
00115         dyStringAppend(dy, cookie->name);
00116         dyStringAppendC(dy, '=');
00117         dyStringAppend(dy, cookie->value);
00118         }
00119     dyStringAppend(dy, "\r\n");
00120     }
00121 }
00122 
00123 
00124 void htmlAttributeFree(struct htmlAttribute **pAttribute)
00125 /* Free up resources associated with attribute. */
00126 {
00127 struct htmlAttribute *att = *pAttribute;
00128 if (att != NULL)
00129     {
00130     freeMem(att->name);
00131     freeMem(att->val);
00132     freez(pAttribute);
00133     }
00134 }
00135 
00136 void htmlAttributeFreeList(struct htmlAttribute **pList)
00137 /* Free a list of dynamically allocated htmlAttribute's */
00138 {
00139 struct htmlAttribute *el, *next;
00140 
00141 for (el = *pList; el != NULL; el = next)
00142     {
00143     next = el->next;
00144     htmlAttributeFree(&el);
00145     }
00146 *pList = NULL;
00147 }
00148 
00149 void htmlTagFree(struct htmlTag **pTag)
00150 /* Free up resources associated with tag. */
00151 {
00152 struct htmlTag *tag = *pTag;
00153 if (tag != NULL)
00154     {
00155     htmlAttributeFreeList(&tag->attributes);
00156     freeMem(tag->name);
00157     freez(pTag);
00158     }
00159 }
00160 
00161 void htmlTagFreeList(struct htmlTag **pList)
00162 /* Free a list of dynamically allocated htmlTag's */
00163 {
00164 struct htmlTag *el, *next;
00165 
00166 for (el = *pList; el != NULL; el = next)
00167     {
00168     next = el->next;
00169     htmlTagFree(&el);
00170     }
00171 *pList = NULL;
00172 }
00173 
00174 void htmlFormVarFree(struct htmlFormVar **pVar)
00175 /* Free up resources associated with form variable. */
00176 {
00177 struct htmlFormVar *var = *pVar;
00178 if (var != NULL)
00179     {
00180     freeMem(var->curVal);
00181     slFreeList(&var->values);
00182     slFreeList(&var->tags);
00183     freez(pVar);
00184     }
00185 }
00186 
00187 void htmlFormVarFreeList(struct htmlFormVar **pList)
00188 /* Free a list of dynamically allocated htmlFormVar's */
00189 {
00190 struct htmlFormVar *el, *next;
00191 
00192 for (el = *pList; el != NULL; el = next)
00193     {
00194     next = el->next;
00195     htmlFormVarFree(&el);
00196     }
00197 *pList = NULL;
00198 }
00199 
00200 
00201 void htmlFormFree(struct htmlForm **pForm)
00202 /* Free up resources associated with form variable. */
00203 {
00204 struct htmlForm *form = *pForm;
00205 if (form != NULL)
00206     {
00207     htmlFormVarFreeList(&form->vars);
00208     freez(pForm);
00209     }
00210 }
00211 
00212 void htmlFormFreeList(struct htmlForm **pList)
00213 /* Free a list of dynamically allocated htmlForm's */
00214 {
00215 struct htmlForm *el, *next;
00216 
00217 for (el = *pList; el != NULL; el = next)
00218     {
00219     next = el->next;
00220     htmlFormFree(&el);
00221     }
00222 *pList = NULL;
00223 }
00224 
00225 void htmlPageFree(struct htmlPage **pPage)
00226 /* Free up resources associated with htmlPage. */
00227 {
00228 struct htmlPage *page = *pPage;
00229 if (page != NULL)
00230     {
00231     freez(&page->url);
00232     htmlStatusFree(&page->status);
00233     freeHashAndVals(&page->header);
00234     htmlCookieFreeList(&page->cookies);
00235     freez(&page->fullText);
00236     htmlTagFreeList(&page->tags);
00237     htmlFormFreeList(&page->forms);
00238     freez(pPage);
00239     }
00240 }
00241 
00242 void htmlPageFreeList(struct htmlPage **pList)
00243 /* Free a list of dynamically allocated htmlPage's */
00244 {
00245 struct htmlPage *el, *next;
00246 
00247 for (el = *pList; el != NULL; el = next)
00248     {
00249     next = el->next;
00250     htmlPageFree(&el);
00251     }
00252 *pList = NULL;
00253 }
00254 
static int findLineNumber(char *start, char *pos)
/* Return the one-based line number of pos within the text that begins at
 * start.  Every newline from start through pos (inclusive) bumps the count. */
{
int lineCount = 1;
char *scan = start;
while (scan <= pos)
    {
    if (*scan == '\n')
        lineCount += 1;
    scan += 1;
    }
return lineCount;
}
00267 
00268 static void tagVaWarn(struct htmlPage *page, struct htmlTag *tag, char *format, 
00269         va_list args)
00270 /* Print warning message and some context of tag. */
00271 {
00272 char context[80];
00273 strncpy(context, tag->start, sizeof(context));
00274 context[sizeof(context)-1] = 0;
00275 warn("Error near line %d of %s:\n %s", findLineNumber(page->htmlText, tag->start), 
00276         page->url, context);
00277 vaWarn(format, args);
00278 }
00279 
static void tagWarn(struct htmlPage *page, struct htmlTag *tag, char *format, ...)
/* Printf-style front end to tagVaWarn: warn about a tag, showing where it is
 * in the page followed by the formatted message. */
{
va_list ap;
va_start(ap, format);
tagVaWarn(page, tag, format, ap);
va_end(ap);
}
00288 
static void tagAbort(struct htmlPage *page, struct htmlTag *tag, char *format, ...)
/* Like tagWarn, but after the warnings have been issued call noWarnAbort()
 * to abort without a further message. */
{
va_list ap;
va_start(ap, format);
tagVaWarn(page, tag, format, ap);
va_end(ap);
noWarnAbort();
}
00298 
00299 struct htmlStatus *htmlStatusParse(char **pText)
00300 /* Read in status from first line.  Update pText to point to next line. 
00301  * Note unlike many routines here, this does not insert zeros into text. */
00302 {
00303 char *text = *pText;
00304 char *end = strchr(text, '\n');
00305 struct htmlStatus *status;
00306 if (end != NULL)
00307    *pText = end+1;
00308 else
00309    *pText = text + strlen(text);
00310 end = skipToSpaces(text);
00311 if (end == NULL)
00312     {
00313     warn("Short status line.");
00314     return NULL;
00315     }
00316 AllocVar(status);
00317 status->version = cloneStringZ(text, end-text);
00318 end = skipLeadingSpaces(end);
00319 if (!isdigit(end[0]))
00320     {
00321     warn("Not a number in status field");
00322     return NULL;
00323     }
00324 status->status = atoi(end);
00325 return status;
00326 }
00327 
char *htmlNextCrLfLine(char **pS)
/* Return the zero-terminated line starting at *pS and advance *pS to the
 * start of the following line.  Returns NULL when *pS is NULL or points at
 * an empty string.  The terminating newline, and the <CR> before it if
 * present, are overwritten with zeros.  When the line has no newline it is
 * returned as is and *pS is set to NULL.  A missing <CR> or a missing
 * newline is reported through verbose(1, ...). */
{
char *line = *pS;
char *newline;
char *next = NULL;
if (line == NULL || line[0] == 0)
    return NULL;
newline = strchr(line, '\n');
if (newline != NULL)
    {
    *newline = 0;
    if (newline != line && newline[-1] == '\r')
        newline[-1] = 0;
    else
        verbose(1, "Missing <CR> in header line\n");
    next = newline + 1;
    }
else
    verbose(1, "End of file in header\n");
*pS = next;
return line;
}
00351 
static void cookieParseNameValuePair(char *s, char **retName, char **retVal)
/* Split "name=value" in place at the first '='.  On return *retName points
 * at the name and *retVal at the value, both inside s.  When s contains no
 * '=' the whole of s is the name and *retVal is the empty string at the
 * end of s. */
{
char *val = strchr(s, '=');
if (val == NULL)
    {
    /* No value: stay on the terminator instead of stepping past it. */
    val = s + strlen(s);
    }
else
    *val++ = 0;
*retName = s;
*retVal = val;
}
00364 
00365 static struct htmlCookie *parseCookie(char *s)
00366 /* Parse out cookie line to the right of Set-Cookie. */
00367 {
00368 char *e, *name, *val;
00369 struct htmlCookie *cookie;
00370 
00371 /* Grab up to semicolon, which is the cookie name/value pair. */
00372 e = strchr(s, ';');
00373 if (e == NULL)
00374     {
00375     warn("Missing ';' in cookie");
00376     return NULL;
00377     }
00378 *e++ = 0;
00379 
00380 /* Allocate cookie and fill out name/value pair. */
00381 AllocVar(cookie);
00382 cookieParseNameValuePair(s, &name, &val);
00383 cookie->name = cloneString(name);
00384 cookie->value = cloneString(val);
00385 
00386 /* Loop through to grab the other info - domain and so forth. */
00387 s = e;
00388 for (;;)
00389     {
00390     /* Find next semicolon and zero-terminate it. */
00391     s = skipLeadingSpaces(s);
00392     e = strchr(s, ';');
00393     if (e == NULL)
00394         break;
00395     *e++ = 0;
00396 
00397     /* Parse out name/value pairs and save it away if it's one we know about. */
00398     cookieParseNameValuePair(s, &name, &val);
00399     if (sameString(name, "domain"))
00400         cookie->domain = cloneString(val);
00401     else if (sameString(name, "path"))
00402         cookie->path = cloneString(val);
00403     else if (sameString(name, "expires"))
00404         cookie->expires = cloneString(val);
00405     else if (sameString(name, "secure"))
00406         cookie->secure = TRUE;
00407 
00408     s = e;
00409     }
00410 return cookie;
00411 }
00412 
static struct hash *htmlHeaderRead(char **pHtml, struct htmlCookie **pCookies)
/* Read header lines of the form "name: value" starting at *pHtml and stop
 * at the first line that has no word on it.  Each line is stored in the
 * returned hash, keyed by its first word (colon included), with a clone of
 * the rest of the line as the value.  Set-Cookie: lines are additionally
 * parsed and appended to *pCookies.  Warns if the text runs out before the
 * header ends.  *pHtml is advanced past the lines that were read. */
{
struct hash *header = hashNew(6);
char *line;
while ((line = htmlNextCrLfLine(pHtml)) != NULL)
    {
    char *key = nextWord(&line);
    if (key == NULL)
        return header;          /* Line without a word ends the header. */
    line = skipLeadingSpaces(line);
    /* Clone first: parseCookie below chops up line in place. */
    hashAdd(header, key, cloneString(line));
    if (sameString(key, "Set-Cookie:"))
        {
        struct htmlCookie *cookie = parseCookie(line);
        if (cookie != NULL)
            slAddTail(pCookies, cookie);
        }
    }
warn("End of file in header");
return header;
}
00441 
00442 static char *htmlAttributeFindVal(struct htmlAttribute *list, char *name)
00443 /* Find named attribute or return NULL. */
00444 {
00445 struct htmlAttribute *att;
00446 for (att = list; att != NULL; att = att->next)
00447     {
00448     if (sameWord(att->name, name))
00449         return att->val;
00450     }
00451 return NULL;
00452 }
00453 
00454 
00455 char *htmlTagAttributeVal(struct htmlPage *page, struct htmlTag *tag, 
00456         char *name, char *defaultVal)
00457 /* Return value of named attribute, or defaultVal if attribute doesn't exist. */
00458 {
00459 char *val = htmlAttributeFindVal(tag->attributes, name);
00460 if (val == NULL)
00461     val = defaultVal;
00462 return val;
00463 }
00464 
char *htmlTagAttributeNeeded(struct htmlPage *page, struct htmlTag *tag, char *name)
/* Return the value of an attribute the tag is expected to have.  When the
 * attribute is absent, warn (with tag context) and return "n/a" instead. */
{
char *found = htmlTagAttributeVal(page, tag, name, NULL);
if (found != NULL)
    return found;
tagWarn(page, tag, "Missing %s attribute", name);
return "n/a";
}
00477 
00478 static struct htmlTag *htmlTagScan(char *html, char *dupe)
00479 /* Scan HTML for tags and return a list of them. 
00480  * Html is the text to scan, and dupe is a copy of it
00481  * which this routine will insert 0's in in the course of
00482  * parsing.*/
00483 {
00484 char *s = dupe, c, *e, *tagName;
00485 struct htmlTag *tagList = NULL, *tag;
00486 struct htmlAttribute *att;
00487 int pos;
00488 
00489 for (;;)
00490     {
00491     c = *s++;
00492     if (c == 0)
00493         break;
00494     if (c == '<')
00495         {
00496         if (*s == '!')  /* HTML comment. */
00497             {
00498             s += 1;
00499             if (s[0] == '-' && s[1] == '-')
00500                 s = stringIn("-->", s);
00501             else
00502                 s = strchr(s, '>');
00503             if (s == NULL)
00504                 {
00505                 warn("End of file in comment");
00506                 break;
00507                 }
00508             }
00509         else
00510             {
00511             /* Grab first word into tagName. */
00512             e = s;
00513             for (;;)
00514                 {
00515                 c = *e;
00516                 if (c == '>' || c == 0 || isspace(c))
00517                     break;
00518                 e += 1;
00519                 }
00520             if (c != 0)
00521                *e++ = 0;
00522             tagName = s;
00523             s = e;
00524             
00525             /* Allocate tag, fill in name, and stick it on list. */
00526             AllocVar(tag);
00527             tag->name = cloneString(tagName);
00528             slAddHead(&tagList, tag);
00529             pos = tagName - dupe - 1;
00530             tag->start = html+pos;
00531 
00532             /* If already got end tag (or EOF) stop processing tag. */
00533             if (c == '>' || c == 0)
00534                 {
00535                 tag->end = html + (e - dupe);
00536                 continue;
00537                 }
00538 
00539             /* Process name/value pairs until get end tag. */
00540             for (;;)
00541                 {
00542                 char *name, *val;
00543                 boolean gotEnd = FALSE;
00544 
00545                 /* Check for end tag. */
00546                 s = skipLeadingSpaces(s);
00547                 if (s[0] == '>' || s[0] == 0)
00548                     {
00549                     tag->end = html + (s - dupe);
00550                     if (s[0] == '>')
00551                         tag->end += 1;
00552                     break;
00553                     }
00554 
00555                 /* Get name - everything up to equals. */
00556                 e = s;
00557                 for (;;)
00558                     {
00559                     c = *e;
00560                     if (c == '=')
00561                         break;
00562                     else if (c == '>')
00563                         break;
00564                     else if (c == 0)
00565                         break;
00566                     else if (isspace(c))
00567                         break;
00568                     e += 1;
00569                     }
00570                 if (c == 0)
00571                     {
00572                     warn("End of file in tag");
00573                     break;
00574                     }
00575                 name = s;
00576                 *e++ = 0;
00577                 eraseTrailingSpaces(name);
00578                 if (c == '>')
00579                     {
00580                     val = "";
00581                     gotEnd = TRUE;
00582                     tag->end = html + (e - dupe);
00583                     }
00584                 else if (isspace(c))
00585                     {
00586                     val = "";
00587                     }
00588                 else
00589                     {
00590                     val = e = skipLeadingSpaces(e);
00591                     if (e[0] == '"')
00592                         {
00593                         if (!parseQuotedString(val, val, &e))
00594                             break;
00595                         }
00596                     else
00597                         {
00598                         for (;;)
00599                             {
00600                             c = *e;
00601                             if (c == '>')
00602                                 {
00603                                 gotEnd = TRUE;
00604                                 *e++ = 0;
00605                                 tag->end = html + (e - dupe);
00606                                 break;
00607                                 }
00608                             else if (isspace(c))
00609                                 {
00610                                 *e++ = 0;
00611                                 break;
00612                                 }
00613                             else if (c == 0)
00614                                 break;
00615                             ++e;
00616                             }
00617                         }
00618                     }
00619                 AllocVar(att);
00620                 att->name = cloneString(name);
00621                 att->val = cloneString(val);
00622                 slAddTail(&tag->attributes, att);
00623                 s = e;
00624                 if (gotEnd)
00625                     break;
00626                 }
00627             }
00628         }
00629     }
00630 slReverse(&tagList);
00631 return tagList;
00632 }
00633 
00634 static struct htmlFormVar *findOrMakeVar(struct htmlPage *page, char *name, 
00635         struct hash *hash, struct htmlTag *tag, struct htmlFormVar **pVarList)
00636 /* Find variable of existing name if it exists,  otherwise
00637  * make a new one and add to hash and list.  Add reference
00638  * to this tag to var. */
00639 {
00640 struct htmlFormVar *var = hashFindVal(hash, name);
00641 if (var == NULL)
00642     {
00643     AllocVar(var);
00644     var->name = name;
00645     var->tagName = tag->name;
00646     hashAdd(hash, name, var);
00647     slAddHead(pVarList, var);
00648     }
00649 else
00650     {
00651     if (!sameWord(var->tagName, tag->name))
00652         {
00653         tagWarn(page, tag, "Mixing FORM variable tag types %s and %s", 
00654                 var->tagName, tag->name);
00655         var->tagName = tag->name;
00656         }
00657     }
00658 refAdd(&var->tags, tag);
00659 return var;
00660 }
00661 
00662 static boolean isMixableInputType(char *type)
00663 /* Return TRUE if it's a type you can mix with others ok, like
00664  * button, submit, and image. */
00665 {
00666 return sameWord(type, "BUTTON") || sameWord(type, "SUBMIT") 
00667         || sameWord(type, "IMAGE");
00668 }
00669 
00670 static void htmlFormVarAddValue(struct htmlFormVar *var, char *value)
00671 /* Add value to list of predefined values for var. */
00672 {
00673 struct slName *name = slNameNew(value);
00674 slAddTail(&var->values, name);
00675 }
00676 
00677 
00678 static struct htmlFormVar *formParseVars(struct htmlPage *page, struct htmlForm *form)
00679 /* Return a list of variables parsed out of form.  
00680  * A form variable is something that may appear in the name
00681  * side of the name=value pairs that serves as input to a CGI
00682  * script.  The variables may be constructed from buttons, 
00683  * INPUT tags, OPTION lists, or TEXTAREAs. */
00684 {
00685 struct htmlTag *tag;
00686 struct htmlFormVar *varList = NULL, *var;
00687 struct hash *hash = newHash(0);
00688 for (tag = form->startTag->next; tag != form->endTag; tag = tag->next)
00689     {
00690     if (sameWord(tag->name, "INPUT"))
00691         {
00692         char *type = htmlTagAttributeVal(page, tag, "TYPE", NULL);
00693         char *varName = htmlTagAttributeVal(page, tag, "NAME", NULL);
00694         char *value = htmlTagAttributeVal(page, tag, "VALUE", NULL);
00695         if (type == NULL)
00696             type = "TEXT";
00697         if (varName == NULL)
00698             {
00699             if (!sameWord(type, "SUBMIT") && !sameWord(type, "CLEAR")
00700                 && !sameWord(type, "BUTTON") && !sameWord(type, "RESET")
00701                 && !sameWord(type, "IMAGE"))
00702                 tagWarn(page, tag, "Missing NAME attribute");
00703             varName = "n/a";
00704             }
00705         var = findOrMakeVar(page, varName, hash, tag, &varList); 
00706         if (var->type != NULL && !sameWord(var->type, type))
00707             {
00708             if (!isMixableInputType(var->type) || !isMixableInputType(type))
00709                 tagWarn(page, tag, "Mixing input types %s and %s", var->type, type);
00710             }
00711         var->type = type;
00712         if (sameWord(type, "TEXT") || sameWord(type, "PASSWORD") 
00713                 || sameWord(type, "FILE") || sameWord(type, "HIDDEN")
00714                 || sameWord(type, "IMAGE"))
00715             {
00716             var->curVal = cloneString(value);
00717             }
00718         else if (sameWord(type, "CHECKBOX"))
00719             {
00720             if (htmlTagAttributeVal(page, tag, "CHECKED", NULL) != NULL)
00721                 var->curVal = cloneString("on");
00722             }
00723         else if (sameWord(type, "RADIO"))
00724             {
00725             if (htmlTagAttributeVal(page, tag, "CHECKED", NULL) != NULL)
00726                 var->curVal = cloneString(value);
00727             htmlFormVarAddValue(var, value);
00728             }
00729         else if ( sameWord(type, "RESET") || sameWord(type, "BUTTON") ||
00730                 sameWord(type, "SUBMIT") || sameWord(type, "IMAGE") ||
00731                 sameWord(type, "n/a"))
00732             {
00733             /* Do nothing. */
00734             }
00735         else
00736             {
00737             tagWarn(page, tag, "Unrecognized INPUT TYPE %s", type);
00738             }
00739         }
00740     else if (sameWord(tag->name, "SELECT"))
00741         {
00742         char *varName = htmlTagAttributeNeeded(page, tag, "NAME");
00743         struct htmlTag *subTag;
00744         var = findOrMakeVar(page, varName, hash, tag, &varList); 
00745         for (subTag = tag->next; subTag != form->endTag; subTag = subTag->next)
00746             {
00747             if (sameWord(subTag->name, "/SELECT"))
00748                 {
00749                 if (var->curVal == NULL && var->values != NULL)
00750                     {
00751                     var->curVal = cloneString(var->values->name);
00752                     }
00753                 break;
00754                 }
00755             else if (sameWord(subTag->name, "OPTION"))
00756                 {
00757                 char *val = cloneString(htmlTagAttributeVal(page, subTag, "VALUE", NULL));
00758                 if (val == NULL)
00759                     {
00760                     char *e = strchr(subTag->end, '<');
00761                     if (e != NULL)
00762                         val = cloneStringZ(subTag->end, e - subTag->end);
00763                     }
00764                 if (val != NULL)
00765                     htmlFormVarAddValue(var, val);
00766                 if (htmlTagAttributeVal(page, subTag, "SELECTED", NULL) != NULL)
00767                     {
00768                     if (val != NULL)
00769                         var->curVal = cloneString(val);
00770                     }
00771                 freez(&val);
00772                 }
00773             }
00774         }
00775     else if (sameWord(tag->name, "TEXTAREA"))
00776         {
00777         char *varName = htmlTagAttributeNeeded(page, tag, "NAME");
00778         char *e = strchr(tag->end, '<');
00779         var = findOrMakeVar(page, varName, hash, tag, &varList); 
00780         if (e != NULL)
00781             var->curVal = cloneStringZ(tag->end, e - tag->end);
00782         }
00783     }
00784 freeHash(&hash);    
00785 slReverse(&varList);
00786 for (var = varList; var != NULL; var = var->next)
00787     {
00788     slReverse(&var->tags);
00789     }
00790 return varList;
00791 }
00792 
00793 static struct htmlForm *htmlParseForms(struct htmlPage *page,
00794         struct htmlTag *startTag, struct htmlTag *endTag)
00795 /* Parse out list of forms from tag stream. */
00796 {
00797 struct htmlForm *formList = NULL, *form = NULL;
00798 struct htmlTag *tag;
00799 for (tag = startTag; tag != endTag; tag = tag->next)
00800     {
00801     if (sameWord(tag->name, "FORM"))
00802         {
00803         if (form != NULL)
00804             tagWarn(page, tag, "FORM inside of FORM");
00805         AllocVar(form);
00806         form->startTag = tag;
00807         slAddHead(&formList, form);
00808         form->name = htmlTagAttributeVal(page, tag, "name", "n/a");
00809         form->action = htmlTagAttributeNeeded(page, tag, "action");
00810         form->method = htmlTagAttributeVal(page, tag, "method", "GET");
00811         }
00812     else if (sameWord(tag->name, "/FORM"))
00813         {
00814         if (form == NULL)
00815             tagWarn(page, tag, "/FORM outside of FORM");
00816         else
00817             {
00818             form->endTag = tag->next;
00819             form = NULL;
00820             }
00821         }
00822     }
00823 slReverse(&formList);
00824 for (form = formList; form != NULL; form = form->next)
00825     {
00826     form->vars = formParseVars(page, form);
00827     }
00828 return formList;
00829 }
00830 
00831 struct htmlPage *htmlPageParse(char *url, char *fullText)
00832 /* Parse out page and return. */
00833 {
00834 struct htmlPage *page;
00835 char *dupe = cloneLongString(fullText);
00836 char *s = dupe;
00837 struct htmlStatus *status = htmlStatusParse(&s);
00838 char *contentType;
00839 
00840 if (status == NULL)
00841     return NULL;
00842 
00843 AllocVar(page);
00844 page->url = cloneString(url);
00845 page->fullText = fullText;
00846 page->status = status;
00847 page->header = htmlHeaderRead(&s, &page->cookies);
00848 contentType = hashFindVal(page->header, "Content-Type:");
00849 if (contentType == NULL)        
00850     {
00851     warn("No contentType, assuming text/html");
00852     contentType = cloneString("text/html");
00853     hashAdd(page->header, "Content-Type:", contentType);
00854     }
00855 page->htmlText = fullText + (s - dupe);
00856 if (startsWith("text/html", contentType))
00857     {
00858     page->tags = htmlTagScan(page->htmlText, s);
00859     page->forms = htmlParseForms(page, page->tags, NULL);
00860     }
00861 freez(&dupe);
00862 return page;
00863 }
00864 
00865 struct htmlPage *htmlPageParseNoHead(char *url, char *htmlText)
00866 /* Parse out page in memory (past http header if any) and return. */
00867 {
00868 char *dupe = cloneString(htmlText);
00869 struct htmlPage *page;
00870 AllocVar(page);
00871 page->url = cloneString(url);
00872 page->fullText = page->htmlText = htmlText;
00873 page->tags = htmlTagScan(page->htmlText, dupe);
00874 page->forms = htmlParseForms(page, page->tags, NULL);
00875 freez(&dupe);
00876 return page;
00877 }
00878 
00879 struct htmlPage *htmlPageParseOk(char *url, char *fullText)
00880 /* Parse out page and return only if status ok. */
00881 {
00882 struct htmlPage *page = htmlPageParse(url, fullText);
00883 if (page == NULL)
00884    noWarnAbort();
00885 if (page->status->status != 200)
00886    errAbort("%s returned with status code %d", url, page->status->status);
00887 return page;
00888 }
00889 
00890 char *htmlSlurpWithCookies(char *url, struct htmlCookie *cookies)
00891 /* Send get message to url with cookies, and return full response as
00892  * a dyString.  This is not parsed or validated, and includes http
00893  * header lines.  Typically you'd pass this to htmlPageParse() to
00894  * get an actual page. */
00895 {
00896 struct dyString *dyHeader = dyStringNew(0);
00897 struct dyString *dyText;
00898 int sd;
00899 
00900 cookieOutput(dyHeader, cookies);
00901 dyStringAppend(dyHeader, "\r\n");
00902 sd = netOpenHttpExt(url, "GET", FALSE);
00903 write(sd, dyHeader->string, dyHeader->stringSize);
00904 dyText = netSlurpFile(sd);
00905 close(sd);
00906 dyStringFree(&dyHeader);
00907 return dyStringCannibalize(&dyText);
00908 }
00909 
struct htmlPage *htmlPageGetWithCookies(char *url, struct htmlCookie *cookies)
/* Fetch url, sending the given cookies along, and parse the response into
 * a page.  Only the name and value of each cookie need to be filled in.
 * Returns whatever htmlPageParse makes of the response. */
{
char *response = htmlSlurpWithCookies(url, cookies);
struct htmlPage *page = htmlPageParse(url, response);
return page;
}
00917 
00918 struct htmlPage *htmlPageForwarded(char *url, struct htmlCookie *cookies)
00919 /* Get html page.  If it's just a forwarding link then get do the
00920  * forwarding.  Cookies is a possibly empty list of cookies with
00921  * name and value parts filled in. */
00922 {
00923 struct htmlPage *page = htmlPageGetWithCookies(url, cookies);
00924 int level, maxLevels = 7;
00925 for (level = 0; level < maxLevels; ++level)
00926     {
00927     struct htmlPage *newPage;
00928     char *newUrl = hashFindVal(page->header, "Location:");
00929     if (newUrl == NULL)
00930         break;
00931     newPage = htmlPageGetWithCookies(newUrl, cookies);
00932     htmlPageFree(&page);
00933     page = newPage;
00934     }
00935 return page;
00936 }
00937 
00938 struct htmlPage *htmlPageForwardedNoAbort(char *url, struct htmlCookie *cookies)
00939 /* Try and get an HTML page.  Print warning and return NULL if there's a problem. */
00940 {
00941 struct errCatch *errCatch = errCatchNew();
00942 struct htmlPage *page = NULL;
00943 if (errCatchStart(errCatch))
00944     page = htmlPageForwarded(url, cookies);
00945 errCatchEnd(errCatch);
00946 if (errCatch->gotError)
00947     warn(errCatch->message->string);
00948 errCatchFree(&errCatch);
00949 return page;
00950 }
00951 
00952 
struct htmlPage *htmlPageGet(char *url)
/* Get page from url.  When url names an existing file, the file is
 * read in whole and parsed with htmlPageParseNoHead; otherwise url is
 * fetched with no cookies. */
{
char *contents = NULL;
if (!fileExists(url))
    return htmlPageGetWithCookies(url, NULL);
readInGulp(url, &contents, NULL);
return htmlPageParseNoHead(url, contents);
}
00965 
00966 void htmlFormVarPrint(struct htmlFormVar *var, FILE *f, char *prefix)
00967 /* Print out variable to file, prepending prefix. */
00968 {
00969 struct slName *val;
00970 fprintf(f, "%s%s\t%s\t%s\t%s\n", prefix, var->name, var->tagName, 
00971         naForNull(var->type), 
00972         naForNull(var->curVal));
00973 for (val = var->values; val != NULL; val = val->next)
00974      fprintf(f, "%s\t%s\n", prefix, val->name);
00975 }
00976 
00977 void htmlFormPrint(struct htmlForm *form, FILE *f)
00978 /* Print out form structure. */
00979 {
00980 struct htmlFormVar *var;
00981 fprintf(f, "%s\t%s\t%s\n", form->name, form->method, form->action);
00982 for (var = form->vars; var != NULL; var = var->next)
00983     htmlFormVarPrint(var, f, "\t");
00984 }
00985 
00986 struct htmlForm *htmlFormGet(struct htmlPage *page, char *name)
00987 /* Get named form. */
00988 {
00989 struct htmlForm *form;
00990 for (form = page->forms; form != NULL; form = form->next)
00991     if (sameWord(form->name, name))
00992         break;
00993 return form;
00994 }
00995 
00996 struct htmlFormVar *htmlFormVarGet(struct htmlForm *form, char *name)
00997 /* Get named variable. */
00998 {
00999 struct htmlFormVar *var;
01000 if (form == NULL)
01001     errAbort("Null form passed to htmlFormVarGet");
01002 for (var = form->vars; var != NULL; var = var->next)
01003     if (sameWord(var->name, name))
01004         break;
01005 return var;
01006 }
01007 
01008 void htmlFormVarSet(struct htmlForm *form, char *name, char *val)
01009 /* Set variable to given value. Create it if it doesn't exist*/
01010 {
01011 struct htmlFormVar *var;
01012 if (form == NULL)
01013     errAbort("Null form passed to htmlFormVarSet");
01014 var = htmlFormVarGet(form, name);
01015 if (var == NULL)
01016     {
01017     AllocVar(var);
01018     var->type = "TEXT";
01019     var->tagName = "INPUT";
01020     var->name = name;
01021     slAddHead(&form->vars, var);
01022     }
01023 freez(&var->curVal);
01024 var->curVal = cloneString(val);
01025 }
01026 
01027 
01028 struct htmlFormVar *htmlPageGetVar(struct htmlPage *page, struct htmlForm *form, char *name)
01029 /* Get named variable.  If form is NULL, first form in page is used. */
01030 {
01031 if (form == NULL)
01032     form = page->forms;
01033 return htmlFormVarGet(form, name);
01034 }
01035 
01036 void htmlPageSetVar(struct htmlPage *page, struct htmlForm *form, char *name, char *val)
01037 /* Set variable to given value.  If form is NULL, first form in page is used. */
01038 {
01039 if (page == NULL)
01040     errAbort("Null page passed to htmlPageSetVar");
01041 if (form == NULL)
01042     form = page->forms;
01043 if (form == NULL)
01044     errAbort("Null form in htmlPageSetVar");
01045 htmlFormVarSet(form, name, val);
01046 }
01047 
static void asciiEntityDecode(char *in, char *out, int inLength)
/* Decode SGML numeric character references of the form &#NNN; found
 * in the first inLength bytes of in, writing the result to out
 * followed by a terminating zero.  The output is never longer than the
 * input, so out can be the same buffer as in; otherwise it needs room
 * for inLength+1 bytes.  In must be zero-terminated since sscanf reads
 * from it.
 * Only the ASCII/8-bit range is supported: a reference that is not a
 * decimal number from 1 to 255 decodes to '?'.  A "&#" with no ';'
 * within the next five characters is copied through unchanged. */
{
char *end = in + inLength;
/* Walk by input position rather than by iteration count: one entity
 * consumes several input bytes, and counting iterations would run the
 * reads past the end of the input. */
while (in < end)
    {
    char c = *in++;
    if (c == '&' && in < end && *in == '#')
        {
        char *e;
        in++;   /* step over the '#' */
        /* Search for the closing ';' inside the input range only. */
        e = memchr(in, ';', end - in);
        if (e == NULL || (e - in) > 5)
            { /* probably a badly formatted string, just recover and continue */
            *out++ = '&';
            *out++ = '#';
            }
        else
            {
            int code;
            /* Code 0 would cut the output string short and negative
             * values are not characters, so both become '?' along
             * with anything above 255 or not a number at all. */
            if (sscanf(in, "%d", &code) != 1 || code <= 0 || code > 255)
                code = '?';
            in = e + 1;
            *out++ = code;
            }
        }
    else
        *out++ = c;
    }
*out = 0;
}
01088 
01089 
01090 char *htmlExpandUrl(char *base, char *url)
01091 /* Expand URL that is relative to base to stand on it's own. 
01092  * Return NULL if it's not http or https. */
01093 {
01094 struct dyString *dy = NULL;
01095 char *hostName, *pastHostName;
01096 
01097 /* some mailto: have SGML char encoding, e.g &#97; to hide from spambots */
01098 url = cloneString(url); /* Clone because asciiEntityDecode may modify it. */
01099 asciiEntityDecode(url, url, strlen(url));
01100 
01101 /* In easiest case URL is actually absolute and begins with
01102  * protocol.  Just return clone of url. */
01103 if (startsWith("http:", url) || startsWith("https:", url))
01104     return url;
01105 
01106 /* If it's got a colon, but no http or https, then it's some
01107  * protocol we don't understand, like a mailto.  Just return NULL. */
01108 if (strchr(url, ':') != NULL)
01109     {
01110     freez(&url);
01111     return NULL;
01112     }
01113 
01114 /* Figure out first character past host name. Load up
01115  * return string with protocol (if any) and host name. */
01116 dy = dyStringNew(256);
01117 if (startsWith("http:", base) || startsWith("https:", base))
01118     hostName = (strchr(base, ':') + 3);
01119 else
01120     hostName = base;
01121 pastHostName = strchr(hostName, '/');
01122 if (pastHostName == NULL)
01123     pastHostName = hostName + strlen(hostName);
01124 dyStringAppendN(dy, base, pastHostName - base);
01125 
01126 /* Add url to return string after host name. */
01127 if (startsWith("/", url))       /* New URL is absolute, just append to hostName */
01128     {
01129     dyStringAppend(dy, url);
01130     }
01131 else
01132     {
01133     char *curDir = pastHostName;
01134     char *endDir;
01135     if (curDir[0] == '/')
01136         curDir += 1;
01137     dyStringAppendC(dy, '/');
01138     endDir = strrchr(curDir, '/');
01139     if (endDir == NULL)
01140         endDir = curDir;
01141     if (startsWith("../", url))
01142         {
01143         char *dir = cloneStringZ(curDir, endDir-curDir);
01144         char *path = expandRelativePath(dir, url);
01145         if (path != NULL)
01146              {
01147              dyStringAppend(dy, path);
01148              }
01149         freez(&dir);
01150         freez(&path);
01151         }
01152     else
01153         {
01154         dyStringAppendN(dy, curDir, endDir-curDir);
01155         if (lastChar(dy->string) != '/')
01156             dyStringAppendC(dy, '/');
01157         dyStringAppend(dy, url);
01158         }
01159     }
01160 freez(&url);
01161 return dyStringCannibalize(&dy);
01162 }
01163 
01164 static void appendCgiVar(struct dyString *dy, char *name, char *value)
01165 /* Append cgiVar with cgi-encoded value to dy. */
01166 {
01167 char *enc = NULL;
01168 if (value == NULL)
01169     value = "";
01170 enc = cgiEncode(value);
01171 if (dy->stringSize != 0)
01172     dyStringAppendC(dy, '&');
01173 dyStringAppend(dy, name);
01174 dyStringAppendC(dy, '=');
01175 dyStringAppend(dy, enc);
01176 freez(&enc);
01177 }
01178 
01179 #define MIMEBUFSIZE 4096
01180 
01181 static void appendMimeVar(struct dyString *dy, char *name, char *value, char *varType, char *boundary)
01182 /* Append cgiVar with cgi-encoded value to dy. */
01183 {
01184 char *fileName = NULL;
01185 
01186 if (value == NULL)
01187     value = "";
01188 dyStringAppend(dy, "\r\n--");
01189 dyStringAppend(dy, boundary);
01190 dyStringAppend(dy, "\r\n");
01191 dyStringAppend(dy, "content-disposition: form-data; name=\"");
01192 dyStringAppend(dy, name);
01193 dyStringAppend(dy, "\"");
01194 
01195 if (varType && sameWord(varType, "FILE"))
01196     {
01197     fileName = strrchr(value,'/'); 
01198     if (fileName)
01199         ++fileName;
01200     else
01201         fileName = value;
01202     dyStringAppend(dy, "; filename=\"");
01203     dyStringAppend(dy, fileName);
01204     dyStringAppend(dy, "\"");
01205     }
01206 dyStringAppend(dy, "\r\n");
01207 dyStringAppend(dy, "\r\n");
01208 if (varType && sameWord(varType, "FILE") && !sameWord(value,""))
01209     {
01210     FILE *f = mustOpen(value, "r");
01211     char buf[MIMEBUFSIZE];
01212     int bytesRead = 0;
01213     do
01214         {
01215         bytesRead = fread(buf,1,MIMEBUFSIZE,f);
01216         if (bytesRead < 0)
01217             errnoAbort("error reading file to upload %s",value);
01218         dyStringAppendN(dy, buf, bytesRead);
01219         }
01220     while(bytesRead > 0);
01221     carefulClose(&f);
01222     }
01223 else    
01224     dyStringAppend(dy, value);
01225 }
01226 
static void appendMimeTerminus(struct dyString *dy, char *boundary)
/* Close off a multipart body in dy: a line break, two dashes, the
 * boundary string, two more dashes, and a final line break. */
{
char *opener = "\r\n--";
char *closer = "--\r\n";
dyStringAppend(dy, opener);
dyStringAppend(dy, boundary);
dyStringAppend(dy, closer);
}
01234 
01235 
static int countOccurrences(char *needle, int nLen, char *haystack, int hLen)
/* Return how many times the nLen bytes at needle are found by memMatch
 * in the hLen bytes at haystack.  After each hit the search resumes
 * just past it, so hits that overlap are counted once. */
{
int found = 0;
int remaining = hLen;
char *cursor = haystack;
for (;;)
    {
    char *hit = memMatch(needle, nLen, cursor, remaining);
    if (hit == NULL)
        break;
    found += 1;
    /* Discard everything up to and including this hit. */
    remaining -= (hit - cursor) + nLen;
    if (remaining < 1)
        break;
    cursor = hit + nLen;
    }
return found;
}
01251 
01252 static boolean isMimeEncoded(struct htmlForm *form)
01253 /* determine if the form is using MIME encoding */
01254 {
01255 struct htmlAttribute *a;
01256 for(a = form->startTag->attributes;a;a = a->next)
01257     if (sameWord(a->name,"ENCTYPE") && sameWord(a->val,"multipart/form-data"))
01258         return TRUE;
01259 return FALSE;
01260 }
01261 
01262 char *htmlFormCgiVars(struct htmlPage *page, struct htmlForm *form, 
01263         char *buttonName, char *buttonVal, struct dyString *dyHeader)
01264 /* Return cgi vars in name=val format from use having pressed
01265  * submit button of given name and value. */
01266 {
01267 struct dyString *dy = newDyString(0);
01268 struct htmlFormVar *var;
01269 boolean isMime = isMimeEncoded(form);
01270 int mimeParts = 0;
01271 char boundary[256];
01272 
01273 while(TRUE)
01274     {
01275     if (isMime)
01276         {
01277         /* choose a new string for the boundary */
01278         /* Set initial seed */
01279         int i = 0;
01280         safef(boundary,sizeof(boundary),"%s", "---------");
01281         srand( (unsigned)time( NULL ) );
01282         for(i=strlen(boundary);i<41;++i)
01283             {
01284             int r = (int) 26 * (rand() / (RAND_MAX + 1.0));
01285             boundary[i] = r+'A';
01286             }
01287         boundary[i] = 0;
01288         }
01289 
01290     if (form == NULL)
01291         form = page->forms;
01292     if (buttonName != NULL && !isMime)
01293         appendCgiVar(dy, buttonName, buttonVal);
01294     for (var = form->vars; var != NULL; var = var->next)
01295         {
01296         if (sameWord(var->tagName, "SELECT") || 
01297             sameWord(var->tagName, "TEXTAREA") || 
01298             (var->type != NULL &&
01299             ((sameWord(var->type, "RADIO") || sameWord(var->type, "TEXTBOX")
01300             || sameWord(var->type, "PASSWORD") || sameWord(var->type, "HIDDEN")
01301             || sameWord(var->type, "TEXT") || sameWord(var->type, "FILE")))))
01302             {
01303             char *val = var->curVal;
01304             if (val == NULL)
01305                 val = "";
01306             if (isMime)
01307                 {
01308                 ++mimeParts;
01309                 appendMimeVar(dy, var->name, val, var->type, boundary);
01310                 }
01311             else            
01312                 appendCgiVar(dy, var->name, val);
01313             }
01314         else if (var->type != NULL && sameWord(var->type, "CHECKBOX"))
01315             {
01316             if (var->curVal != NULL)
01317                 {
01318                 if (isMime)         
01319                     {
01320                     ++mimeParts;
01321                     appendMimeVar(dy, var->name, var->curVal, var->type, boundary);
01322                     }
01323                 else        
01324                     appendCgiVar(dy, var->name, var->curVal);
01325                 }
01326             }
01327         else if (isMime && buttonName && sameWord(buttonName,var->name))
01328             {
01329             ++mimeParts;
01330             appendMimeVar(dy, buttonName, buttonVal, NULL, boundary);
01331             }
01332         }
01333     if (isMime) 
01334         {
01335         ++mimeParts;
01336         appendMimeTerminus(dy,boundary);
01337         if (countOccurrences(boundary,strlen(boundary),dy->string,dy->stringSize) != mimeParts)
01338             { /* boundary was found in input! # occurrences not as expected */
01339             dyStringClear(dy);
01340             continue;  /* if at first you don't succeed, try another boundary string */
01341             }
01342         dyStringPrintf(dyHeader, "Content-type: multipart/form-data, boundary=%s\r\n",boundary);
01343         if (isMime && verboseLevel() == 2)
01344             {
01345             mustWrite(stderr, dyHeader->string, dyHeader->stringSize);
01346             mustWrite(stderr, dy->string, dy->stringSize);
01347             }
01348         }
01349     break;
01350     }   
01351     
01352 return dyStringCannibalize(&dy);
01353 
01354 }
01355 
01356 struct htmlPage *htmlPageFromForm(struct htmlPage *origPage, struct htmlForm *form, 
01357         char *buttonName, char *buttonVal)
01358 /* Return a new htmlPage based on response to pressing indicated button
01359  * on indicated form in origPage. */
01360 {
01361 struct htmlPage *newPage = NULL;
01362 struct dyString *dyUrl = dyStringNew(0);
01363 struct dyString *dyHeader = dyStringNew(0);
01364 struct dyString *dyText = NULL;
01365 char *url = htmlExpandUrl(origPage->url, form->action);
01366 char *cgiVars = NULL;
01367 int contentLength = 0;
01368 int sd = -1;
01369 
01370 dyStringAppend(dyUrl, url);
01371 cookieOutput(dyHeader, origPage->cookies);
01372 if (sameWord(form->method, "GET"))
01373     {
01374     cgiVars = htmlFormCgiVars(origPage, form, buttonName, buttonVal, dyHeader);
01375     dyStringAppend(dyUrl, "?");
01376     dyStringAppend(dyUrl, cgiVars);
01377     verbose(3, "GET %s\n", dyUrl->string);
01378     sd = netOpenHttpExt(dyUrl->string, form->method, FALSE);
01379     dyStringAppend(dyHeader, "\r\n");
01380     write(sd, dyHeader->string, dyHeader->stringSize);
01381     }
01382 else if (sameWord(form->method, "POST"))
01383     {
01384     cgiVars = htmlFormCgiVars(origPage, form, buttonName, buttonVal, dyHeader);
01385     contentLength = strlen(cgiVars);
01386     verbose(3, "POST %s\n", dyUrl->string);
01387     sd = netOpenHttpExt(dyUrl->string, form->method, FALSE);
01388     dyStringPrintf(dyHeader, "Content-length: %d\r\n", contentLength);
01389     dyStringAppend(dyHeader, "\r\n");
01390     write(sd, dyHeader->string, dyHeader->stringSize);
01391     write(sd, cgiVars, contentLength);
01392     }
01393 dyText = netSlurpFile(sd);
01394 close(sd);
01395 newPage = htmlPageParse(url, dyStringCannibalize(&dyText));
01396 freez(&url);
01397 dyStringFree(&dyUrl);
01398 dyStringFree(&dyHeader);
01399 freez(&cgiVars);
01400 return newPage;
01401 }
01402 
01403 struct slName *htmlPageScanAttribute(struct htmlPage *page, 
01404         char *tagName, char *attribute)
01405 /* Scan page for values of particular attribute in particular tag.
01406  * if tag is NULL then scans in all tags. */
01407 {
01408 struct htmlTag *tag;
01409 struct htmlAttribute *att;
01410 struct slName *list = NULL, *el;
01411 
01412 for (tag = page->tags; tag != NULL; tag = tag->next)
01413     {
01414     if (tagName == NULL || sameWord(tagName, tag->name))
01415         {
01416         for (att = tag->attributes; att != NULL; att = att->next)
01417             {
01418             if (sameWord(attribute, att->name))
01419                 {
01420                 el = slNameNew(att->val);
01421                 slAddHead(&list, el);
01422                 }
01423             }
01424         }
01425     }
01426 slReverse(&list);
01427 return list;
01428 }
01429 
struct slName *htmlPageLinks(struct htmlPage *page)
/* Return the values of the HREF attributes of all tags in page. */
{
struct slName *hrefs = htmlPageScanAttribute(page, NULL, "HREF");
return hrefs;
}
01435 
struct htmlTableRow
/* Bookkeeping for the table row that is open during validation. */
    {
    struct htmlTableRow *next;  /* Next in list. */
    int tdCount;                /* Number of <TD>/<TH> tags seen in row. */
    int inTd;                   /* TRUE from <TD>/<TH> until its close tag. */
    };
01443 
struct htmlTable 
/* Bookkeeping for a table that is open during validation. */
    {
    struct htmlTable *next;     /* Next in list (the enclosing table). */
    struct htmlTableRow *row;   /* Row currently open, NULL between rows. */
    int rowCount;               /* Number of <TR> tags seen in table. */
    };
01451 
01452 static void validateTables(struct htmlPage *page, 
01453         struct htmlTag *startTag, struct htmlTag *endTag)
01454 /* Validate <TABLE><TR><TD> are all properly nested, and that there
01455  * are no empty rows. */
01456 {
01457 struct htmlTable *tableStack = NULL, *table;
01458 struct htmlTableRow *row;
01459 struct htmlTag *tag;
01460 
01461 for (tag = startTag; tag != endTag; tag = tag->next)
01462     {
01463     if (sameWord(tag->name, "TABLE"))
01464         {
01465         if (tableStack != NULL)
01466             {
01467             if (tableStack->row == NULL || !tableStack->row->inTd)
01468             tagAbort(page, tag, "TABLE inside of another table, but not inside of <TR><TD>\n");
01469             }
01470         AllocVar(table);
01471         slAddHead(&tableStack, table);
01472         }
01473     else if (sameWord(tag->name, "/TABLE"))
01474         {
01475         if ((table = tableStack) == NULL)
01476             tagAbort(page, tag, "Extra </TABLE> tag");
01477         if (table->rowCount == 0)
01478             tagAbort(page, tag, "<TABLE> with no <TR>'s");
01479         if (table->row != NULL)
01480             tagAbort(page, tag, "</TABLE> inside of a row");
01481         tableStack = table->next;
01482         freez(&table);
01483         }
01484     else if (sameWord(tag->name, "TR"))
01485         {
01486         if ((table = tableStack) == NULL)
01487             tagAbort(page, tag, "<TR> outside of TABLE");
01488         if (table->row != NULL)
01489             tagAbort(page, tag, "<TR>...<TR> with no </TR> in between");
01490         AllocVar(table->row);
01491         table->rowCount += 1;
01492         }
01493     else if (sameWord(tag->name, "/TR"))
01494         {
01495         if ((table = tableStack) == NULL)
01496             tagAbort(page, tag, "</TR> outside of TABLE");
01497         if (table->row == NULL)
01498             tagAbort(page, tag, "</TR> with no <TR>");
01499 #ifdef LEGAL_ACTUALLY
01500         if (table->row->inTd)
01501             {
01502             tagAbort(page, tag, "</TR> while <TD> is open");
01503             }
01504 #endif /* LEGAL_ACTUALLY */
01505         if (table->row->tdCount == 0)
01506             tagAbort(page, tag, "Empty row in <TABLE>");
01507         freez(&table->row);
01508         }
01509     else if (sameWord(tag->name, "TD") || sameWord(tag->name, "TH"))
01510         {
01511         if ((table = tableStack) == NULL)
01512             tagAbort(page, tag, "<%s> outside of <TABLE>", tag->name);
01513         if ((row = table->row) == NULL)
01514             tagAbort(page, tag, "<%s> outside of <TR>", tag->name);
01515 #ifdef LEGAL_ACTUALLY
01516         if (row->inTd)
01517             {
01518             tagAbort(page, tag, "<%s>...<%s> with no </%s> in between", 
01519                 tag->name, tag->name, tag->name);
01520             }
01521 #endif /* LEGAL_ACTUALLY */
01522         row->inTd = TRUE;
01523         row->tdCount += 1;
01524         }
01525     else if (sameWord(tag->name, "/TD") || sameWord(tag->name, "/TH"))
01526         {
01527         if ((table = tableStack) == NULL)
01528             tagAbort(page, tag, "<%s> outside of <TABLE>", tag->name);
01529         if ((row = table->row) == NULL)
01530             tagAbort(page, tag, "<%s> outside of <TR>", tag->name);
01531         if (!row->inTd)
01532             tagAbort(page, tag, "<%s> with no <%s>", tag->name, tag->name+1);
01533         row->inTd = FALSE;
01534         }
01535     }
01536 if (tableStack != NULL)
01537     tagAbort(page, tag, "Missing </TABLE>");
01538 }
01539 
01540 static void checkTagIsInside(struct htmlPage *page, char *outsiders, char *insiders,  
01541         struct htmlTag *startTag, struct htmlTag *endTag)
01542 /* Check that insiders are all bracketed by outsiders. */
01543 {
01544 char *outDupe = cloneString(outsiders);
01545 char *inDupe = cloneString(insiders);
01546 char *line, *word;
01547 int depth = 0;
01548 struct htmlTag *tag;
01549 struct hash *outOpen = newHash(8);
01550 struct hash *outClose = newHash(8);
01551 struct hash *inHash = newHash(8);
01552 char buf[256];
01553 
01554 /* Create hashes of all insiders */
01555 line = inDupe;
01556 while ((word = nextWord(&line)) != NULL)
01557     {
01558     touppers(word);
01559     hashAdd(inHash, word, NULL);
01560     }
01561 
01562 /* Create hash of open and close outsiders. */
01563 line = outDupe;
01564 while ((word = nextWord(&line)) != NULL)
01565     {
01566     touppers(word);
01567     hashAdd(outOpen, word, NULL);
01568     safef(buf, sizeof(buf), "/%s", word);
01569     hashAdd(outClose, buf, NULL);
01570     }
01571 
01572 /* Stream through tags making sure that insiders are
01573  * at least one deep inside of outsiders. */
01574 for (tag = startTag; tag != NULL; tag = tag->next)
01575     {
01576     char *type = tag->name;
01577     if (hashLookup(outOpen, type ))
01578         ++depth;
01579     else if (hashLookup(outClose, type))
01580         --depth;
01581     else if (hashLookup(inHash, type))
01582         {
01583         if (depth <= 0)
01584             tagAbort(page, tag, "%s outside of any of %s", type, outsiders);
01585         }
01586     }
01587 freeHash(&inHash);
01588 freeHash(&outOpen);
01589 freeHash(&outClose);
01590 freeMem(outDupe);
01591 freeMem(inDupe);
01592 }
01593 
01594 static void checkNest(struct htmlPage *page,
01595         char *type, struct htmlTag *startTag, struct htmlTag *endTag)
01596 /* Check that <type> and </type> tags are properly nested. */
01597 {
01598 struct htmlTag *tag;
01599 int depth = 0;
01600 char endType[256];
01601 safef(endType, sizeof(endType), "/%s", type);
01602 for (tag = startTag; tag != endTag; tag = tag->next)
01603     {
01604     if (sameWord(tag->name, type))
01605         ++depth;
01606     else if (sameWord(tag->name, endType))
01607         {
01608         --depth;
01609         if (depth < 0)
01610            tagAbort(page, tag, "<%s> without preceding <%s>", endType, type);
01611         }
01612     }
01613 if (depth != 0)
01614     errAbort("Missing <%s> tag", endType);
01615 }
01616 
static void validateNestingTags(struct htmlPage *page,
        struct htmlTag *startTag, struct htmlTag *endTag,
        char *nesters[], int nesterCount)
/* Run checkNest over the given tag range once for each of the first
 * nesterCount tag names in nesters. */
{
char **nester = nesters;
int remaining = nesterCount;
while (remaining-- > 0)
    {
    checkNest(page, *nester, startTag, endTag);
    ++nester;
    }
}
01626 
static char *bodyNesters[] = 
/* Nesting tags that appear in body.  Every entry needs its own comma:
 * adjacent string literals with no comma between them are joined into
 * a single name that matches no tag. */
{
    "ADDRESS", "DIV", "H1", "H2", "H3", "H4", "H5", "H6",
    "ACRONYM", "BLOCKQUOTE", "CITE", "CODE", "DEL", "DFN",
    "DIR", "DL", "MENU", "OL", "UL", "CAPTION", "TABLE", 
    "A", "MAP", "OBJECT", "FORM"
};
01635 
static char *headNesters[] =
/* Tags that have to nest properly inside the header. */
{
    "TITLE"
};
01641 
01642 static struct htmlTag *validateBody(struct htmlPage *page, struct htmlTag *startTag)
01643 /* Go through tags from current position (just past <BODY>)
01644  * up to and including </BODY> and check some things. */
01645 {
01646 struct htmlTag *tag, *endTag = NULL;
01647 
01648 /* First search for end tag. */
01649 for (tag = startTag; tag != NULL; tag = tag->next)
01650     {
01651     if (sameWord(tag->name, "/BODY"))
01652         {
01653         endTag = tag;
01654         break;
01655         }
01656     }
01657 if (endTag == NULL)
01658     errAbort("Missing </BODY>");
01659 validateTables(page, startTag, endTag);
01660 checkTagIsInside(page, "DIR MENU OL UL", "LI", startTag, endTag);
01661 checkTagIsInside(page, "DL", "DD DT", startTag, endTag);
01662 checkTagIsInside(page, "COLGROUP TABLE", "COL", startTag, endTag);
01663 checkTagIsInside(page, "MAP", "AREA", startTag, endTag);
01664 checkTagIsInside(page, "FORM", 
01665         "INPUT BUTTON /BUTTON OPTION SELECT /SELECT TEXTAREA /TEXTAREA"
01666         "FIELDSET /FIELDSET"
01667         , 
01668         startTag, endTag);
01669 validateNestingTags(page, startTag, endTag, bodyNesters, ArraySize(bodyNesters));
01670 return endTag->next;
01671 }
01672 
static char *urlOkChars()
/* Return a newly allocated 256 entry table indexed by character that
 * is set to 1 for the characters accepted in URLs: those for which
 * isalnum is true plus a handful of punctuation marks.  The other
 * entries are left as AllocArray delivered them (presumably zero). */
{
/* This list is a little more inclusive than W3's. */
char *extras = "=-/%.;:_&+";
char *table, *s;
int c;
AllocArray(table, 256);
for (c=0; c<256; ++c)
    {
    if (isalnum(c))
        table[c] = 1;
    }
for (s = extras; *s != 0; ++s)
    table[(unsigned char)*s] = 1;
return table;
}
01697 
01698 static void validateCgiUrl(char *url)
01699 /* Make sure URL follows basic CGI encoding rules. */
01700 {
01701 if (startsWith("http:", url) || startsWith("https:", url))
01702     {
01703     static char *okChars = NULL;
01704     UBYTE c, *s;
01705     if (okChars == NULL)
01706         okChars = urlOkChars();
01707     url = strchr(url, '?');
01708     if (url != NULL)
01709         {
01710         s = (UBYTE*)url+1;
01711         while ((c = *s++) != 0)
01712             {
01713             if (!okChars[c])
01714                 {
01715                 errAbort("Character %c not allowed in URL %s", c, url);
01716                 }
01717             }
01718         }
01719     }
01720 }
01721 
01722 static void validateCgiUrls(struct htmlPage *page)
01723 /* Make sure URLs in page follow basic CGI encoding rules. */
01724 {
01725 struct htmlForm *form;
01726 struct slName *linkList = htmlPageLinks(page), *link;
01727 
01728 for (form = page->forms; form != NULL; form = form->next)
01729     validateCgiUrl(form->action);
01730 for (link = linkList; link != NULL; link = link->next)
01731     validateCgiUrl(link->name);
01732 slFreeList(&linkList);
01733 }
01734 
01735 static int countTagsOfType(struct htmlTag *tagList, char *type)
01736 /* Count number of tags of given type. */
01737 {
01738 struct htmlTag *tag;
01739 int count = 0;
01740 for (tag = tagList; tag != NULL; tag = tag->next)
01741     if (sameString(tag->name, type))
01742         ++count;
01743 return count;
01744 }
01745 
static void checkExactlyOne(struct htmlTag *tagList, char *type)
/* Abort unless tagList holds one and only one tag of the given type. */
{
int found = countTagsOfType(tagList, type);
if (found == 1)
    return;
errAbort("Expecting exactly 1 <%s>, got %d", type, found);
}
01753 
01754 
01755 void htmlPageFormOrAbort(struct htmlPage *page)
01756 /* Aborts if no FORM found */
01757 {
01758 if (page == NULL)
01759     errAbort("Can't validate NULL page");
01760 if (page->forms == NULL)
01761     errAbort("No form found");
01762 }
01763 
01764 void htmlPageValidateOrAbort(struct htmlPage *page)
01765 /* Do some basic validations.  Aborts if there is a problem. */
01766 {
01767 struct htmlTag *tag;
01768 boolean gotTitle = FALSE;
01769 char *contentType = NULL;
01770 
01771 if (page == NULL)
01772     errAbort("Can't validate NULL page");
01773 if (page->header != NULL)
01774     contentType = hashFindVal(page->header, "Content-Type:");
01775 if (contentType == NULL || startsWith("text/html", contentType))
01776     {
01777     /* To simplify things upper case all tag names. */
01778     for (tag = page->tags; tag != NULL; tag = tag->next)
01779         touppers(tag->name);
01780 
01781     checkExactlyOne(page->tags, "BODY");
01782 
01783     /* Validate header, and make a suggestion or two */
01784     if ((tag = page->tags) == NULL)
01785         errAbort("No tags");
01786     if (!sameWord(tag->name, "HTML"))
01787         errAbort("Doesn't start with <HTML> tag");
01788     tag = tag->next;
01789     if (tag == NULL || !sameWord(tag->name, "HEAD"))
01790         warn("<HEAD> tag does not follow <HTML> tag");
01791     else
01792         {
01793         for (;;)
01794             {
01795             tag = tag->next;
01796             if (tag == NULL)
01797                 errAbort("Missing </HEAD>");
01798             if (sameWord(tag->name, "TITLE"))
01799                 gotTitle = TRUE;
01800             if (sameWord(tag->name, "/HEAD"))
01801                 break;
01802             }
01803         if (!gotTitle)
01804             warn("No title in <HEAD>");
01805         validateNestingTags(page, page->tags, tag, headNesters, ArraySize(headNesters));
01806         tag = tag->next;
01807         }
01808     if (tag == NULL || !sameWord(tag->name, "BODY"))
01809         errAbort("<BODY> tag does not follow <HTML> tag");
01810     tag = validateBody(page, tag->next);
01811     if (tag == NULL || !sameWord(tag->name, "/HTML"))
01812         errAbort("Missing </HTML>");
01813     validateCgiUrls(page);
01814     }
01815 }
01816 

Generated on Tue Dec 25 18:39:31 2007 for blat by  doxygen 1.5.2