00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #include "common.h"
00014 #include "errabort.h"
00015 #include "errCatch.h"
00016 #include "memalloc.h"
00017 #include "linefile.h"
00018 #include "hash.h"
00019 #include "dystring.h"
00020 #include "cheapcgi.h"
00021 #include "obscure.h"
00022 #include "filePath.h"
00023 #include "net.h"
00024 #include "htmlPage.h"
00025
00026 static char const rcsid[] = "$Id: htmlPage.c,v 1.32 2006/07/29 00:17:28 galt Exp $";
00027
00028 void htmlStatusFree(struct htmlStatus **pStatus)
00029
00030 {
00031 struct htmlStatus *status = *pStatus;
00032 if (status != NULL)
00033 {
00034 freeMem(status->version);
00035 freez(pStatus);
00036 }
00037 }
00038
00039 void htmlStatusFreeList(struct htmlStatus **pList)
00040
00041 {
00042 struct htmlStatus *el, *next;
00043
00044 for (el = *pList; el != NULL; el = next)
00045 {
00046 next = el->next;
00047 htmlStatusFree(&el);
00048 }
00049 *pList = NULL;
00050 }
00051
00052 void htmlCookieFree(struct htmlCookie **pCookie)
00053
00054 {
00055 struct htmlCookie *cookie = *pCookie;
00056 if (cookie != NULL)
00057 {
00058 freeMem(cookie->name);
00059 freeMem(cookie->value);
00060 freeMem(cookie->domain);
00061 freeMem(cookie->path);
00062 freeMem(cookie->expires);
00063 freez(pCookie);
00064 }
00065 }
00066
00067 void htmlCookieFreeList(struct htmlCookie **pList)
00068
00069 {
00070 struct htmlCookie *el, *next;
00071
00072 for (el = *pList; el != NULL; el = next)
00073 {
00074 next = el->next;
00075 htmlCookieFree(&el);
00076 }
00077 *pList = NULL;
00078 }
00079
00080 struct htmlCookie *htmlCookieFileRead(char *fileName)
00081
00082
00083 {
00084 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00085 struct htmlCookie *list = NULL, *cookie;
00086 char *line, *word;
00087 while (lineFileNextReal(lf, &line))
00088 {
00089 word = nextWord(&line);
00090 line = skipLeadingSpaces(line);
00091 if (line == NULL)
00092 errAbort("Missing cookie value line %d of %s", lf->lineIx, lf->fileName);
00093 AllocVar(cookie);
00094 cookie->name = cloneString(word);
00095 cookie->value = cloneString(line);
00096 slAddHead(&list, cookie);
00097 }
00098 lineFileClose(&lf);
00099 slReverse(&list);
00100 return list;
00101 }
00102
00103 static void cookieOutput(struct dyString *dy, struct htmlCookie *cookieList)
00104
00105 {
00106 struct htmlCookie *cookie;
00107 if (cookieList != NULL)
00108 {
00109 dyStringAppend(dy, "Cookie:");
00110 for (cookie = cookieList; cookie != NULL; cookie = cookie->next)
00111 {
00112 if (cookie != cookieList)
00113 dyStringAppendC(dy, ';');
00114 dyStringAppendC(dy, ' ');
00115 dyStringAppend(dy, cookie->name);
00116 dyStringAppendC(dy, '=');
00117 dyStringAppend(dy, cookie->value);
00118 }
00119 dyStringAppend(dy, "\r\n");
00120 }
00121 }
00122
00123
00124 void htmlAttributeFree(struct htmlAttribute **pAttribute)
00125
00126 {
00127 struct htmlAttribute *att = *pAttribute;
00128 if (att != NULL)
00129 {
00130 freeMem(att->name);
00131 freeMem(att->val);
00132 freez(pAttribute);
00133 }
00134 }
00135
00136 void htmlAttributeFreeList(struct htmlAttribute **pList)
00137
00138 {
00139 struct htmlAttribute *el, *next;
00140
00141 for (el = *pList; el != NULL; el = next)
00142 {
00143 next = el->next;
00144 htmlAttributeFree(&el);
00145 }
00146 *pList = NULL;
00147 }
00148
00149 void htmlTagFree(struct htmlTag **pTag)
00150
00151 {
00152 struct htmlTag *tag = *pTag;
00153 if (tag != NULL)
00154 {
00155 htmlAttributeFreeList(&tag->attributes);
00156 freeMem(tag->name);
00157 freez(pTag);
00158 }
00159 }
00160
00161 void htmlTagFreeList(struct htmlTag **pList)
00162
00163 {
00164 struct htmlTag *el, *next;
00165
00166 for (el = *pList; el != NULL; el = next)
00167 {
00168 next = el->next;
00169 htmlTagFree(&el);
00170 }
00171 *pList = NULL;
00172 }
00173
00174 void htmlFormVarFree(struct htmlFormVar **pVar)
00175
00176 {
00177 struct htmlFormVar *var = *pVar;
00178 if (var != NULL)
00179 {
00180 freeMem(var->curVal);
00181 slFreeList(&var->values);
00182 slFreeList(&var->tags);
00183 freez(pVar);
00184 }
00185 }
00186
00187 void htmlFormVarFreeList(struct htmlFormVar **pList)
00188
00189 {
00190 struct htmlFormVar *el, *next;
00191
00192 for (el = *pList; el != NULL; el = next)
00193 {
00194 next = el->next;
00195 htmlFormVarFree(&el);
00196 }
00197 *pList = NULL;
00198 }
00199
00200
00201 void htmlFormFree(struct htmlForm **pForm)
00202
00203 {
00204 struct htmlForm *form = *pForm;
00205 if (form != NULL)
00206 {
00207 htmlFormVarFreeList(&form->vars);
00208 freez(pForm);
00209 }
00210 }
00211
00212 void htmlFormFreeList(struct htmlForm **pList)
00213
00214 {
00215 struct htmlForm *el, *next;
00216
00217 for (el = *pList; el != NULL; el = next)
00218 {
00219 next = el->next;
00220 htmlFormFree(&el);
00221 }
00222 *pList = NULL;
00223 }
00224
00225 void htmlPageFree(struct htmlPage **pPage)
00226
00227 {
00228 struct htmlPage *page = *pPage;
00229 if (page != NULL)
00230 {
00231 freez(&page->url);
00232 htmlStatusFree(&page->status);
00233 freeHashAndVals(&page->header);
00234 htmlCookieFreeList(&page->cookies);
00235 freez(&page->fullText);
00236 htmlTagFreeList(&page->tags);
00237 htmlFormFreeList(&page->forms);
00238 freez(pPage);
00239 }
00240 }
00241
00242 void htmlPageFreeList(struct htmlPage **pList)
00243
00244 {
00245 struct htmlPage *el, *next;
00246
00247 for (el = *pList; el != NULL; el = next)
00248 {
00249 next = el->next;
00250 htmlPageFree(&el);
00251 }
00252 *pList = NULL;
00253 }
00254
static int findLineNumber(char *start, char *pos)
/* Return the 1-based line number of pos within the text that begins at
 * start.  Every '\n' in the range start..pos, both ends included, bumps
 * the count by one. */
{
int lineNo = 1;
char *p = start;
while (p <= pos)
    {
    if (*p == '\n')
        lineNo += 1;
    p += 1;
    }
return lineNo;
}
00267
00268 static void tagVaWarn(struct htmlPage *page, struct htmlTag *tag, char *format,
00269 va_list args)
00270
00271 {
00272 char context[80];
00273 strncpy(context, tag->start, sizeof(context));
00274 context[sizeof(context)-1] = 0;
00275 warn("Error near line %d of %s:\n %s", findLineNumber(page->htmlText, tag->start),
00276 page->url, context);
00277 vaWarn(format, args);
00278 }
00279
static void tagWarn(struct htmlPage *page, struct htmlTag *tag, char *format, ...)
/* printf-style front end to tagVaWarn(): warn about a problem at tag and
 * carry on. */
{
va_list ap;
va_start(ap, format);
tagVaWarn(page, tag, format, ap);
va_end(ap);
}
00288
static void tagAbort(struct htmlPage *page, struct htmlTag *tag, char *format, ...)
/* printf-style front end to tagVaWarn() that, after warning about the
 * problem at tag, calls noWarnAbort(). */
{
va_list ap;
va_start(ap, format);
tagVaWarn(page, tag, format, ap);
va_end(ap);
noWarnAbort();
}
00298
00299 struct htmlStatus *htmlStatusParse(char **pText)
00300
00301
00302 {
00303 char *text = *pText;
00304 char *end = strchr(text, '\n');
00305 struct htmlStatus *status;
00306 if (end != NULL)
00307 *pText = end+1;
00308 else
00309 *pText = text + strlen(text);
00310 end = skipToSpaces(text);
00311 if (end == NULL)
00312 {
00313 warn("Short status line.");
00314 return NULL;
00315 }
00316 AllocVar(status);
00317 status->version = cloneStringZ(text, end-text);
00318 end = skipLeadingSpaces(end);
00319 if (!isdigit(end[0]))
00320 {
00321 warn("Not a number in status field");
00322 return NULL;
00323 }
00324 status->status = atoi(end);
00325 return status;
00326 }
00327
char *htmlNextCrLfLine(char **pS)
/* Return the line starting at *pS, terminating it in place, and advance
 * *pS to the start of the following line.  The '\n' is replaced by NUL,
 * and so is a '\r' directly before it; a missing '\r' is reported through
 * verbose(1, ...).  When no '\n' remains the rest of the text is returned,
 * *pS is set to NULL and "End of file in header" is reported.
 * Returns NULL when *pS is NULL or points at an empty string. */
{
char *lineStart = *pS;
char *nl;
if (lineStart == NULL || lineStart[0] == 0)
    return NULL;
nl = strchr(lineStart, '\n');
if (nl == NULL)
    {
    verbose(1, "End of file in header\n");
    *pS = NULL;
    return lineStart;
    }
*nl = 0;
if (nl != lineStart && nl[-1] == '\r')
    nl[-1] = 0;
else
    verbose(1, "Missing <CR> in header line\n");
*pS = nl + 1;
return lineStart;
}
00351
static void cookieParseNameValuePair(char *s, char **retName, char **retVal)
/* Split "name=value" in place.  The '=' is overwritten with NUL, *retName
 * is set to s and *retVal to the text after the '='.  When s contains no
 * '=' the whole string is the name and *retVal points at the string's own
 * terminator, i.e. an empty value. */
{
char *eq = strchr(s, '=');
if (eq == NULL)
    {
    /* Stay on the terminator rather than stepping past it; whatever
     * follows the NUL is not part of this pair. */
    *retVal = s + strlen(s);
    }
else
    {
    *eq = 0;
    *retVal = eq + 1;
    }
*retName = s;
}
00364
00365 static struct htmlCookie *parseCookie(char *s)
00366
00367 {
00368 char *e, *name, *val;
00369 struct htmlCookie *cookie;
00370
00371
00372 e = strchr(s, ';');
00373 if (e == NULL)
00374 {
00375 warn("Missing ';' in cookie");
00376 return NULL;
00377 }
00378 *e++ = 0;
00379
00380
00381 AllocVar(cookie);
00382 cookieParseNameValuePair(s, &name, &val);
00383 cookie->name = cloneString(name);
00384 cookie->value = cloneString(val);
00385
00386
00387 s = e;
00388 for (;;)
00389 {
00390
00391 s = skipLeadingSpaces(s);
00392 e = strchr(s, ';');
00393 if (e == NULL)
00394 break;
00395 *e++ = 0;
00396
00397
00398 cookieParseNameValuePair(s, &name, &val);
00399 if (sameString(name, "domain"))
00400 cookie->domain = cloneString(val);
00401 else if (sameString(name, "path"))
00402 cookie->path = cloneString(val);
00403 else if (sameString(name, "expires"))
00404 cookie->expires = cloneString(val);
00405 else if (sameString(name, "secure"))
00406 cookie->secure = TRUE;
00407
00408 s = e;
00409 }
00410 return cookie;
00411 }
00412
static struct hash *htmlHeaderRead(char **pHtml, struct htmlCookie **pCookies)
/* Read header lines starting at *pHtml into a new hash and return it.
 * The key is the first word of each line (e.g. "Content-Type:") and the
 * value is a clone of the rest of the line.  Reading stops at the first
 * line with no words, or with a warning when the text runs out.  Every
 * "Set-Cookie:" line is also parsed and appended to *pCookies.  *pHtml
 * is advanced by htmlNextCrLfLine() as lines are consumed. */
{
struct hash *hash = hashNew(6);
for (;;)
    {
    char *line = htmlNextCrLfLine(pHtml);
    char *word;
    if (line == NULL)
        {
        warn("End of file in header");
        break;
        }
    word = nextWord(&line);
    if (word == NULL)
        break;
    line = skipLeadingSpaces(line);
    hashAdd(hash, word, cloneString(line));
    /* NOTE(review): line is presumably NULL when nothing follows the
     * header name - htmlCookieFileRead tests for exactly that after the
     * same nextWord/skipLeadingSpaces sequence.  parseCookie() would
     * hand it to strchr(), so skip a bare "Set-Cookie:". */
    if (line != NULL && sameString(word, "Set-Cookie:"))
        {
        struct htmlCookie *cookie = parseCookie(line);
        if (cookie != NULL)
            slAddTail(pCookies, cookie);
        }
    }
return hash;
}
00441
00442 static char *htmlAttributeFindVal(struct htmlAttribute *list, char *name)
00443
00444 {
00445 struct htmlAttribute *att;
00446 for (att = list; att != NULL; att = att->next)
00447 {
00448 if (sameWord(att->name, name))
00449 return att->val;
00450 }
00451 return NULL;
00452 }
00453
00454
00455 char *htmlTagAttributeVal(struct htmlPage *page, struct htmlTag *tag,
00456 char *name, char *defaultVal)
00457
00458 {
00459 char *val = htmlAttributeFindVal(tag->attributes, name);
00460 if (val == NULL)
00461 val = defaultVal;
00462 return val;
00463 }
00464
char *htmlTagAttributeNeeded(struct htmlPage *page, struct htmlTag *tag, char *name)
/* Return the value of the named attribute of tag.  When the attribute is
 * absent a warning is issued through tagWarn() and the literal string
 * "n/a" is returned instead. */
{
char *found = htmlTagAttributeVal(page, tag, name, NULL);
if (found != NULL)
    return found;
tagWarn(page, tag, "Missing %s attribute", name);
return "n/a";
}
00477
00478 static struct htmlTag *htmlTagScan(char *html, char *dupe)
00479
00480
00481
00482
00483 {
00484 char *s = dupe, c, *e, *tagName;
00485 struct htmlTag *tagList = NULL, *tag;
00486 struct htmlAttribute *att;
00487 int pos;
00488
00489 for (;;)
00490 {
00491 c = *s++;
00492 if (c == 0)
00493 break;
00494 if (c == '<')
00495 {
00496 if (*s == '!')
00497 {
00498 s += 1;
00499 if (s[0] == '-' && s[1] == '-')
00500 s = stringIn("-->", s);
00501 else
00502 s = strchr(s, '>');
00503 if (s == NULL)
00504 {
00505 warn("End of file in comment");
00506 break;
00507 }
00508 }
00509 else
00510 {
00511
00512 e = s;
00513 for (;;)
00514 {
00515 c = *e;
00516 if (c == '>' || c == 0 || isspace(c))
00517 break;
00518 e += 1;
00519 }
00520 if (c != 0)
00521 *e++ = 0;
00522 tagName = s;
00523 s = e;
00524
00525
00526 AllocVar(tag);
00527 tag->name = cloneString(tagName);
00528 slAddHead(&tagList, tag);
00529 pos = tagName - dupe - 1;
00530 tag->start = html+pos;
00531
00532
00533 if (c == '>' || c == 0)
00534 {
00535 tag->end = html + (e - dupe);
00536 continue;
00537 }
00538
00539
00540 for (;;)
00541 {
00542 char *name, *val;
00543 boolean gotEnd = FALSE;
00544
00545
00546 s = skipLeadingSpaces(s);
00547 if (s[0] == '>' || s[0] == 0)
00548 {
00549 tag->end = html + (s - dupe);
00550 if (s[0] == '>')
00551 tag->end += 1;
00552 break;
00553 }
00554
00555
00556 e = s;
00557 for (;;)
00558 {
00559 c = *e;
00560 if (c == '=')
00561 break;
00562 else if (c == '>')
00563 break;
00564 else if (c == 0)
00565 break;
00566 else if (isspace(c))
00567 break;
00568 e += 1;
00569 }
00570 if (c == 0)
00571 {
00572 warn("End of file in tag");
00573 break;
00574 }
00575 name = s;
00576 *e++ = 0;
00577 eraseTrailingSpaces(name);
00578 if (c == '>')
00579 {
00580 val = "";
00581 gotEnd = TRUE;
00582 tag->end = html + (e - dupe);
00583 }
00584 else if (isspace(c))
00585 {
00586 val = "";
00587 }
00588 else
00589 {
00590 val = e = skipLeadingSpaces(e);
00591 if (e[0] == '"')
00592 {
00593 if (!parseQuotedString(val, val, &e))
00594 break;
00595 }
00596 else
00597 {
00598 for (;;)
00599 {
00600 c = *e;
00601 if (c == '>')
00602 {
00603 gotEnd = TRUE;
00604 *e++ = 0;
00605 tag->end = html + (e - dupe);
00606 break;
00607 }
00608 else if (isspace(c))
00609 {
00610 *e++ = 0;
00611 break;
00612 }
00613 else if (c == 0)
00614 break;
00615 ++e;
00616 }
00617 }
00618 }
00619 AllocVar(att);
00620 att->name = cloneString(name);
00621 att->val = cloneString(val);
00622 slAddTail(&tag->attributes, att);
00623 s = e;
00624 if (gotEnd)
00625 break;
00626 }
00627 }
00628 }
00629 }
00630 slReverse(&tagList);
00631 return tagList;
00632 }
00633
00634 static struct htmlFormVar *findOrMakeVar(struct htmlPage *page, char *name,
00635 struct hash *hash, struct htmlTag *tag, struct htmlFormVar **pVarList)
00636
00637
00638
00639 {
00640 struct htmlFormVar *var = hashFindVal(hash, name);
00641 if (var == NULL)
00642 {
00643 AllocVar(var);
00644 var->name = name;
00645 var->tagName = tag->name;
00646 hashAdd(hash, name, var);
00647 slAddHead(pVarList, var);
00648 }
00649 else
00650 {
00651 if (!sameWord(var->tagName, tag->name))
00652 {
00653 tagWarn(page, tag, "Mixing FORM variable tag types %s and %s",
00654 var->tagName, tag->name);
00655 var->tagName = tag->name;
00656 }
00657 }
00658 refAdd(&var->tags, tag);
00659 return var;
00660 }
00661
00662 static boolean isMixableInputType(char *type)
00663
00664
00665 {
00666 return sameWord(type, "BUTTON") || sameWord(type, "SUBMIT")
00667 || sameWord(type, "IMAGE");
00668 }
00669
00670 static void htmlFormVarAddValue(struct htmlFormVar *var, char *value)
00671
00672 {
00673 struct slName *name = slNameNew(value);
00674 slAddTail(&var->values, name);
00675 }
00676
00677
00678 static struct htmlFormVar *formParseVars(struct htmlPage *page, struct htmlForm *form)
00679
00680
00681
00682
00683
00684 {
00685 struct htmlTag *tag;
00686 struct htmlFormVar *varList = NULL, *var;
00687 struct hash *hash = newHash(0);
00688 for (tag = form->startTag->next; tag != form->endTag; tag = tag->next)
00689 {
00690 if (sameWord(tag->name, "INPUT"))
00691 {
00692 char *type = htmlTagAttributeVal(page, tag, "TYPE", NULL);
00693 char *varName = htmlTagAttributeVal(page, tag, "NAME", NULL);
00694 char *value = htmlTagAttributeVal(page, tag, "VALUE", NULL);
00695 if (type == NULL)
00696 type = "TEXT";
00697 if (varName == NULL)
00698 {
00699 if (!sameWord(type, "SUBMIT") && !sameWord(type, "CLEAR")
00700 && !sameWord(type, "BUTTON") && !sameWord(type, "RESET")
00701 && !sameWord(type, "IMAGE"))
00702 tagWarn(page, tag, "Missing NAME attribute");
00703 varName = "n/a";
00704 }
00705 var = findOrMakeVar(page, varName, hash, tag, &varList);
00706 if (var->type != NULL && !sameWord(var->type, type))
00707 {
00708 if (!isMixableInputType(var->type) || !isMixableInputType(type))
00709 tagWarn(page, tag, "Mixing input types %s and %s", var->type, type);
00710 }
00711 var->type = type;
00712 if (sameWord(type, "TEXT") || sameWord(type, "PASSWORD")
00713 || sameWord(type, "FILE") || sameWord(type, "HIDDEN")
00714 || sameWord(type, "IMAGE"))
00715 {
00716 var->curVal = cloneString(value);
00717 }
00718 else if (sameWord(type, "CHECKBOX"))
00719 {
00720 if (htmlTagAttributeVal(page, tag, "CHECKED", NULL) != NULL)
00721 var->curVal = cloneString("on");
00722 }
00723 else if (sameWord(type, "RADIO"))
00724 {
00725 if (htmlTagAttributeVal(page, tag, "CHECKED", NULL) != NULL)
00726 var->curVal = cloneString(value);
00727 htmlFormVarAddValue(var, value);
00728 }
00729 else if ( sameWord(type, "RESET") || sameWord(type, "BUTTON") ||
00730 sameWord(type, "SUBMIT") || sameWord(type, "IMAGE") ||
00731 sameWord(type, "n/a"))
00732 {
00733
00734 }
00735 else
00736 {
00737 tagWarn(page, tag, "Unrecognized INPUT TYPE %s", type);
00738 }
00739 }
00740 else if (sameWord(tag->name, "SELECT"))
00741 {
00742 char *varName = htmlTagAttributeNeeded(page, tag, "NAME");
00743 struct htmlTag *subTag;
00744 var = findOrMakeVar(page, varName, hash, tag, &varList);
00745 for (subTag = tag->next; subTag != form->endTag; subTag = subTag->next)
00746 {
00747 if (sameWord(subTag->name, "/SELECT"))
00748 {
00749 if (var->curVal == NULL && var->values != NULL)
00750 {
00751 var->curVal = cloneString(var->values->name);
00752 }
00753 break;
00754 }
00755 else if (sameWord(subTag->name, "OPTION"))
00756 {
00757 char *val = cloneString(htmlTagAttributeVal(page, subTag, "VALUE", NULL));
00758 if (val == NULL)
00759 {
00760 char *e = strchr(subTag->end, '<');
00761 if (e != NULL)
00762 val = cloneStringZ(subTag->end, e - subTag->end);
00763 }
00764 if (val != NULL)
00765 htmlFormVarAddValue(var, val);
00766 if (htmlTagAttributeVal(page, subTag, "SELECTED", NULL) != NULL)
00767 {
00768 if (val != NULL)
00769 var->curVal = cloneString(val);
00770 }
00771 freez(&val);
00772 }
00773 }
00774 }
00775 else if (sameWord(tag->name, "TEXTAREA"))
00776 {
00777 char *varName = htmlTagAttributeNeeded(page, tag, "NAME");
00778 char *e = strchr(tag->end, '<');
00779 var = findOrMakeVar(page, varName, hash, tag, &varList);
00780 if (e != NULL)
00781 var->curVal = cloneStringZ(tag->end, e - tag->end);
00782 }
00783 }
00784 freeHash(&hash);
00785 slReverse(&varList);
00786 for (var = varList; var != NULL; var = var->next)
00787 {
00788 slReverse(&var->tags);
00789 }
00790 return varList;
00791 }
00792
00793 static struct htmlForm *htmlParseForms(struct htmlPage *page,
00794 struct htmlTag *startTag, struct htmlTag *endTag)
00795
00796 {
00797 struct htmlForm *formList = NULL, *form = NULL;
00798 struct htmlTag *tag;
00799 for (tag = startTag; tag != endTag; tag = tag->next)
00800 {
00801 if (sameWord(tag->name, "FORM"))
00802 {
00803 if (form != NULL)
00804 tagWarn(page, tag, "FORM inside of FORM");
00805 AllocVar(form);
00806 form->startTag = tag;
00807 slAddHead(&formList, form);
00808 form->name = htmlTagAttributeVal(page, tag, "name", "n/a");
00809 form->action = htmlTagAttributeNeeded(page, tag, "action");
00810 form->method = htmlTagAttributeVal(page, tag, "method", "GET");
00811 }
00812 else if (sameWord(tag->name, "/FORM"))
00813 {
00814 if (form == NULL)
00815 tagWarn(page, tag, "/FORM outside of FORM");
00816 else
00817 {
00818 form->endTag = tag->next;
00819 form = NULL;
00820 }
00821 }
00822 }
00823 slReverse(&formList);
00824 for (form = formList; form != NULL; form = form->next)
00825 {
00826 form->vars = formParseVars(page, form);
00827 }
00828 return formList;
00829 }
00830
00831 struct htmlPage *htmlPageParse(char *url, char *fullText)
00832
00833 {
00834 struct htmlPage *page;
00835 char *dupe = cloneLongString(fullText);
00836 char *s = dupe;
00837 struct htmlStatus *status = htmlStatusParse(&s);
00838 char *contentType;
00839
00840 if (status == NULL)
00841 return NULL;
00842
00843 AllocVar(page);
00844 page->url = cloneString(url);
00845 page->fullText = fullText;
00846 page->status = status;
00847 page->header = htmlHeaderRead(&s, &page->cookies);
00848 contentType = hashFindVal(page->header, "Content-Type:");
00849 if (contentType == NULL)
00850 {
00851 warn("No contentType, assuming text/html");
00852 contentType = cloneString("text/html");
00853 hashAdd(page->header, "Content-Type:", contentType);
00854 }
00855 page->htmlText = fullText + (s - dupe);
00856 if (startsWith("text/html", contentType))
00857 {
00858 page->tags = htmlTagScan(page->htmlText, s);
00859 page->forms = htmlParseForms(page, page->tags, NULL);
00860 }
00861 freez(&dupe);
00862 return page;
00863 }
00864
00865 struct htmlPage *htmlPageParseNoHead(char *url, char *htmlText)
00866
00867 {
00868 char *dupe = cloneString(htmlText);
00869 struct htmlPage *page;
00870 AllocVar(page);
00871 page->url = cloneString(url);
00872 page->fullText = page->htmlText = htmlText;
00873 page->tags = htmlTagScan(page->htmlText, dupe);
00874 page->forms = htmlParseForms(page, page->tags, NULL);
00875 freez(&dupe);
00876 return page;
00877 }
00878
00879 struct htmlPage *htmlPageParseOk(char *url, char *fullText)
00880
00881 {
00882 struct htmlPage *page = htmlPageParse(url, fullText);
00883 if (page == NULL)
00884 noWarnAbort();
00885 if (page->status->status != 200)
00886 errAbort("%s returned with status code %d", url, page->status->status);
00887 return page;
00888 }
00889
00890 char *htmlSlurpWithCookies(char *url, struct htmlCookie *cookies)
00891
00892
00893
00894
00895 {
00896 struct dyString *dyHeader = dyStringNew(0);
00897 struct dyString *dyText;
00898 int sd;
00899
00900 cookieOutput(dyHeader, cookies);
00901 dyStringAppend(dyHeader, "\r\n");
00902 sd = netOpenHttpExt(url, "GET", FALSE);
00903 write(sd, dyHeader->string, dyHeader->stringSize);
00904 dyText = netSlurpFile(sd);
00905 close(sd);
00906 dyStringFree(&dyHeader);
00907 return dyStringCannibalize(&dyText);
00908 }
00909
struct htmlPage *htmlPageGetWithCookies(char *url, struct htmlCookie *cookies)
/* Fetch url, sending cookies (may be NULL), and parse the response into
 * an htmlPage.  Returns NULL when the response cannot be parsed. */
{
char *buf = htmlSlurpWithCookies(url, cookies);
struct htmlPage *page = htmlPageParse(url, buf);
/* A successful parse hands buf over to the page.  A failed one does not,
 * so the buffer has to be released here. */
if (page == NULL)
    freez(&buf);
return page;
}
00917
00918 struct htmlPage *htmlPageForwarded(char *url, struct htmlCookie *cookies)
00919
00920
00921
00922 {
00923 struct htmlPage *page = htmlPageGetWithCookies(url, cookies);
00924 int level, maxLevels = 7;
00925 for (level = 0; level < maxLevels; ++level)
00926 {
00927 struct htmlPage *newPage;
00928 char *newUrl = hashFindVal(page->header, "Location:");
00929 if (newUrl == NULL)
00930 break;
00931 newPage = htmlPageGetWithCookies(newUrl, cookies);
00932 htmlPageFree(&page);
00933 page = newPage;
00934 }
00935 return page;
00936 }
00937
00938 struct htmlPage *htmlPageForwardedNoAbort(char *url, struct htmlCookie *cookies)
00939
00940 {
00941 struct errCatch *errCatch = errCatchNew();
00942 struct htmlPage *page = NULL;
00943 if (errCatchStart(errCatch))
00944 page = htmlPageForwarded(url, cookies);
00945 errCatchEnd(errCatch);
00946 if (errCatch->gotError)
00947 warn(errCatch->message->string);
00948 errCatchFree(&errCatch);
00949 return page;
00950 }
00951
00952
struct htmlPage *htmlPageGet(char *url)
/* Get a page and parse it.  When url names an existing local file its
 * contents are read and parsed as bare HTML with no header; otherwise the
 * url is fetched, without cookies, and parsed as a full response. */
{
char *buf;
if (!fileExists(url))
    return htmlPageGetWithCookies(url, NULL);
readInGulp(url, &buf, NULL);
return htmlPageParseNoHead(url, buf);
}
00965
00966 void htmlFormVarPrint(struct htmlFormVar *var, FILE *f, char *prefix)
00967
00968 {
00969 struct slName *val;
00970 fprintf(f, "%s%s\t%s\t%s\t%s\n", prefix, var->name, var->tagName,
00971 naForNull(var->type),
00972 naForNull(var->curVal));
00973 for (val = var->values; val != NULL; val = val->next)
00974 fprintf(f, "%s\t%s\n", prefix, val->name);
00975 }
00976
00977 void htmlFormPrint(struct htmlForm *form, FILE *f)
00978
00979 {
00980 struct htmlFormVar *var;
00981 fprintf(f, "%s\t%s\t%s\n", form->name, form->method, form->action);
00982 for (var = form->vars; var != NULL; var = var->next)
00983 htmlFormVarPrint(var, f, "\t");
00984 }
00985
00986 struct htmlForm *htmlFormGet(struct htmlPage *page, char *name)
00987
00988 {
00989 struct htmlForm *form;
00990 for (form = page->forms; form != NULL; form = form->next)
00991 if (sameWord(form->name, name))
00992 break;
00993 return form;
00994 }
00995
00996 struct htmlFormVar *htmlFormVarGet(struct htmlForm *form, char *name)
00997
00998 {
00999 struct htmlFormVar *var;
01000 if (form == NULL)
01001 errAbort("Null form passed to htmlFormVarGet");
01002 for (var = form->vars; var != NULL; var = var->next)
01003 if (sameWord(var->name, name))
01004 break;
01005 return var;
01006 }
01007
01008 void htmlFormVarSet(struct htmlForm *form, char *name, char *val)
01009
01010 {
01011 struct htmlFormVar *var;
01012 if (form == NULL)
01013 errAbort("Null form passed to htmlFormVarSet");
01014 var = htmlFormVarGet(form, name);
01015 if (var == NULL)
01016 {
01017 AllocVar(var);
01018 var->type = "TEXT";
01019 var->tagName = "INPUT";
01020 var->name = name;
01021 slAddHead(&form->vars, var);
01022 }
01023 freez(&var->curVal);
01024 var->curVal = cloneString(val);
01025 }
01026
01027
01028 struct htmlFormVar *htmlPageGetVar(struct htmlPage *page, struct htmlForm *form, char *name)
01029
01030 {
01031 if (form == NULL)
01032 form = page->forms;
01033 return htmlFormVarGet(form, name);
01034 }
01035
01036 void htmlPageSetVar(struct htmlPage *page, struct htmlForm *form, char *name, char *val)
01037
01038 {
01039 if (page == NULL)
01040 errAbort("Null page passed to htmlPageSetVar");
01041 if (form == NULL)
01042 form = page->forms;
01043 if (form == NULL)
01044 errAbort("Null form in htmlPageSetVar");
01045 htmlFormVarSet(form, name, val);
01046 }
01047
static void asciiEntityDecode(char *in, char *out, int inLength)
/* Decode numeric character references of the form "&#NNN;" found in the
 * first inLength bytes of in, writing the NUL-terminated result to out.
 * The output is never longer than the input, so out may be the same
 * buffer as in.  A reference that does not parse as a number, or whose
 * code is outside 0..255, becomes '?'.  A "&#" with no ';' within the next
 * five characters (or within the input) is copied through as is.
 * No byte at or beyond in + inLength is decoded or copied. */
{
char *end = in + inLength;
while (in < end)
    {
    char c = *in++;
    if (c == '&' && in < end && *in == '#')
        {
        char *e;
        in++;
        e = strchr(in, ';');
        if (e == NULL || e >= end || (e - in) > 5)
            {
            *out++ = '&';
            *out++ = '#';
            }
        else
            {
            int code;
            if (sscanf(in, "%d", &code) != 1)
                {
                code = '?';
                }
            if (code < 0 || code > 255)
                {
                code = '?';
                }
            /* Resume just past the ';' that closed the reference. */
            in = e + 1;
            *out++ = code;
            }
        }
    else
        *out++ = c;
    }
*out++ = 0;
}
01088
01089
01090 char *htmlExpandUrl(char *base, char *url)
01091
01092
01093 {
01094 struct dyString *dy = NULL;
01095 char *hostName, *pastHostName;
01096
01097
01098 url = cloneString(url);
01099 asciiEntityDecode(url, url, strlen(url));
01100
01101
01102
01103 if (startsWith("http:", url) || startsWith("https:", url))
01104 return url;
01105
01106
01107
01108 if (strchr(url, ':') != NULL)
01109 {
01110 freez(&url);
01111 return NULL;
01112 }
01113
01114
01115
01116 dy = dyStringNew(256);
01117 if (startsWith("http:", base) || startsWith("https:", base))
01118 hostName = (strchr(base, ':') + 3);
01119 else
01120 hostName = base;
01121 pastHostName = strchr(hostName, '/');
01122 if (pastHostName == NULL)
01123 pastHostName = hostName + strlen(hostName);
01124 dyStringAppendN(dy, base, pastHostName - base);
01125
01126
01127 if (startsWith("/", url))
01128 {
01129 dyStringAppend(dy, url);
01130 }
01131 else
01132 {
01133 char *curDir = pastHostName;
01134 char *endDir;
01135 if (curDir[0] == '/')
01136 curDir += 1;
01137 dyStringAppendC(dy, '/');
01138 endDir = strrchr(curDir, '/');
01139 if (endDir == NULL)
01140 endDir = curDir;
01141 if (startsWith("../", url))
01142 {
01143 char *dir = cloneStringZ(curDir, endDir-curDir);
01144 char *path = expandRelativePath(dir, url);
01145 if (path != NULL)
01146 {
01147 dyStringAppend(dy, path);
01148 }
01149 freez(&dir);
01150 freez(&path);
01151 }
01152 else
01153 {
01154 dyStringAppendN(dy, curDir, endDir-curDir);
01155 if (lastChar(dy->string) != '/')
01156 dyStringAppendC(dy, '/');
01157 dyStringAppend(dy, url);
01158 }
01159 }
01160 freez(&url);
01161 return dyStringCannibalize(&dy);
01162 }
01163
01164 static void appendCgiVar(struct dyString *dy, char *name, char *value)
01165
01166 {
01167 char *enc = NULL;
01168 if (value == NULL)
01169 value = "";
01170 enc = cgiEncode(value);
01171 if (dy->stringSize != 0)
01172 dyStringAppendC(dy, '&');
01173 dyStringAppend(dy, name);
01174 dyStringAppendC(dy, '=');
01175 dyStringAppend(dy, enc);
01176 freez(&enc);
01177 }
01178
01179 #define MIMEBUFSIZE 4096
01180
01181 static void appendMimeVar(struct dyString *dy, char *name, char *value, char *varType, char *boundary)
01182
01183 {
01184 char *fileName = NULL;
01185
01186 if (value == NULL)
01187 value = "";
01188 dyStringAppend(dy, "\r\n--");
01189 dyStringAppend(dy, boundary);
01190 dyStringAppend(dy, "\r\n");
01191 dyStringAppend(dy, "content-disposition: form-data; name=\"");
01192 dyStringAppend(dy, name);
01193 dyStringAppend(dy, "\"");
01194
01195 if (varType && sameWord(varType, "FILE"))
01196 {
01197 fileName = strrchr(value,'/');
01198 if (fileName)
01199 ++fileName;
01200 else
01201 fileName = value;
01202 dyStringAppend(dy, "; filename=\"");
01203 dyStringAppend(dy, fileName);
01204 dyStringAppend(dy, "\"");
01205 }
01206 dyStringAppend(dy, "\r\n");
01207 dyStringAppend(dy, "\r\n");
01208 if (varType && sameWord(varType, "FILE") && !sameWord(value,""))
01209 {
01210 FILE *f = mustOpen(value, "r");
01211 char buf[MIMEBUFSIZE];
01212 int bytesRead = 0;
01213 do
01214 {
01215 bytesRead = fread(buf,1,MIMEBUFSIZE,f);
01216 if (bytesRead < 0)
01217 errnoAbort("error reading file to upload %s",value);
01218 dyStringAppendN(dy, buf, bytesRead);
01219 }
01220 while(bytesRead > 0);
01221 carefulClose(&f);
01222 }
01223 else
01224 dyStringAppend(dy, value);
01225 }
01226
static void appendMimeTerminus(struct dyString *dy, char *boundary)
/* Append the closing delimiter of a multipart body to dy: CR/LF, "--",
 * the boundary, "--" and a final CR/LF. */
{
static char *opener = "\r\n--";
static char *closer = "--\r\n";
dyStringAppend(dy, opener);
dyStringAppend(dy, boundary);
dyStringAppend(dy, closer);
}
01234
01235
static int countOccurrences(char *needle, int nLen, char *haystack, int hLen)
/* Return how many non-overlapping matches memMatch() finds for the nLen
 * bytes at needle within the hLen bytes at haystack.  After each match
 * the search resumes just past it. */
{
int found = 0;
for (;;)
    {
    char *hit = memMatch(needle, nLen, haystack, hLen);
    if (hit == NULL)
        break;
    found += 1;
    /* Shrink the remaining length by what was skipped plus the match. */
    hLen -= (hit - haystack) + nLen;
    if (hLen < 1)
        break;
    haystack = hit + nLen;
    }
return found;
}
01251
01252 static boolean isMimeEncoded(struct htmlForm *form)
01253
01254 {
01255 struct htmlAttribute *a;
01256 for(a = form->startTag->attributes;a;a = a->next)
01257 if (sameWord(a->name,"ENCTYPE") && sameWord(a->val,"multipart/form-data"))
01258 return TRUE;
01259 return FALSE;
01260 }
01261
01262 char *htmlFormCgiVars(struct htmlPage *page, struct htmlForm *form,
01263 char *buttonName, char *buttonVal, struct dyString *dyHeader)
01264
01265
01266 {
01267 struct dyString *dy = newDyString(0);
01268 struct htmlFormVar *var;
01269 boolean isMime = isMimeEncoded(form);
01270 int mimeParts = 0;
01271 char boundary[256];
01272
01273 while(TRUE)
01274 {
01275 if (isMime)
01276 {
01277
01278
01279 int i = 0;
01280 safef(boundary,sizeof(boundary),"%s", "---------");
01281 srand( (unsigned)time( NULL ) );
01282 for(i=strlen(boundary);i<41;++i)
01283 {
01284 int r = (int) 26 * (rand() / (RAND_MAX + 1.0));
01285 boundary[i] = r+'A';
01286 }
01287 boundary[i] = 0;
01288 }
01289
01290 if (form == NULL)
01291 form = page->forms;
01292 if (buttonName != NULL && !isMime)
01293 appendCgiVar(dy, buttonName, buttonVal);
01294 for (var = form->vars; var != NULL; var = var->next)
01295 {
01296 if (sameWord(var->tagName, "SELECT") ||
01297 sameWord(var->tagName, "TEXTAREA") ||
01298 (var->type != NULL &&
01299 ((sameWord(var->type, "RADIO") || sameWord(var->type, "TEXTBOX")
01300 || sameWord(var->type, "PASSWORD") || sameWord(var->type, "HIDDEN")
01301 || sameWord(var->type, "TEXT") || sameWord(var->type, "FILE")))))
01302 {
01303 char *val = var->curVal;
01304 if (val == NULL)
01305 val = "";
01306 if (isMime)
01307 {
01308 ++mimeParts;
01309 appendMimeVar(dy, var->name, val, var->type, boundary);
01310 }
01311 else
01312 appendCgiVar(dy, var->name, val);
01313 }
01314 else if (var->type != NULL && sameWord(var->type, "CHECKBOX"))
01315 {
01316 if (var->curVal != NULL)
01317 {
01318 if (isMime)
01319 {
01320 ++mimeParts;
01321 appendMimeVar(dy, var->name, var->curVal, var->type, boundary);
01322 }
01323 else
01324 appendCgiVar(dy, var->name, var->curVal);
01325 }
01326 }
01327 else if (isMime && buttonName && sameWord(buttonName,var->name))
01328 {
01329 ++mimeParts;
01330 appendMimeVar(dy, buttonName, buttonVal, NULL, boundary);
01331 }
01332 }
01333 if (isMime)
01334 {
01335 ++mimeParts;
01336 appendMimeTerminus(dy,boundary);
01337 if (countOccurrences(boundary,strlen(boundary),dy->string,dy->stringSize) != mimeParts)
01338 {
01339 dyStringClear(dy);
01340 continue;
01341 }
01342 dyStringPrintf(dyHeader, "Content-type: multipart/form-data, boundary=%s\r\n",boundary);
01343 if (isMime && verboseLevel() == 2)
01344 {
01345 mustWrite(stderr, dyHeader->string, dyHeader->stringSize);
01346 mustWrite(stderr, dy->string, dy->stringSize);
01347 }
01348 }
01349 break;
01350 }
01351
01352 return dyStringCannibalize(&dy);
01353
01354 }
01355
01356 struct htmlPage *htmlPageFromForm(struct htmlPage *origPage, struct htmlForm *form,
01357 char *buttonName, char *buttonVal)
01358
01359
01360 {
01361 struct htmlPage *newPage = NULL;
01362 struct dyString *dyUrl = dyStringNew(0);
01363 struct dyString *dyHeader = dyStringNew(0);
01364 struct dyString *dyText = NULL;
01365 char *url = htmlExpandUrl(origPage->url, form->action);
01366 char *cgiVars = NULL;
01367 int contentLength = 0;
01368 int sd = -1;
01369
01370 dyStringAppend(dyUrl, url);
01371 cookieOutput(dyHeader, origPage->cookies);
01372 if (sameWord(form->method, "GET"))
01373 {
01374 cgiVars = htmlFormCgiVars(origPage, form, buttonName, buttonVal, dyHeader);
01375 dyStringAppend(dyUrl, "?");
01376 dyStringAppend(dyUrl, cgiVars);
01377 verbose(3, "GET %s\n", dyUrl->string);
01378 sd = netOpenHttpExt(dyUrl->string, form->method, FALSE);
01379 dyStringAppend(dyHeader, "\r\n");
01380 write(sd, dyHeader->string, dyHeader->stringSize);
01381 }
01382 else if (sameWord(form->method, "POST"))
01383 {
01384 cgiVars = htmlFormCgiVars(origPage, form, buttonName, buttonVal, dyHeader);
01385 contentLength = strlen(cgiVars);
01386 verbose(3, "POST %s\n", dyUrl->string);
01387 sd = netOpenHttpExt(dyUrl->string, form->method, FALSE);
01388 dyStringPrintf(dyHeader, "Content-length: %d\r\n", contentLength);
01389 dyStringAppend(dyHeader, "\r\n");
01390 write(sd, dyHeader->string, dyHeader->stringSize);
01391 write(sd, cgiVars, contentLength);
01392 }
01393 dyText = netSlurpFile(sd);
01394 close(sd);
01395 newPage = htmlPageParse(url, dyStringCannibalize(&dyText));
01396 freez(&url);
01397 dyStringFree(&dyUrl);
01398 dyStringFree(&dyHeader);
01399 freez(&cgiVars);
01400 return newPage;
01401 }
01402
01403 struct slName *htmlPageScanAttribute(struct htmlPage *page,
01404 char *tagName, char *attribute)
01405
01406
01407 {
01408 struct htmlTag *tag;
01409 struct htmlAttribute *att;
01410 struct slName *list = NULL, *el;
01411
01412 for (tag = page->tags; tag != NULL; tag = tag->next)
01413 {
01414 if (tagName == NULL || sameWord(tagName, tag->name))
01415 {
01416 for (att = tag->attributes; att != NULL; att = att->next)
01417 {
01418 if (sameWord(attribute, att->name))
01419 {
01420 el = slNameNew(att->val);
01421 slAddHead(&list, el);
01422 }
01423 }
01424 }
01425 }
01426 slReverse(&list);
01427 return list;
01428 }
01429
struct slName *htmlPageLinks(struct htmlPage *page)
/* Return the values of all HREF attributes on the page, from any tag. */
{
struct slName *hrefs = htmlPageScanAttribute(page, NULL, "HREF");
return hrefs;
}
01435
struct htmlTableRow
/* Bookkeeping for the row currently open in a table being validated. */
    {
    struct htmlTableRow *next;  /* Next in list. */
    int tdCount;                /* Bumped by validateTables for each TD/TH. */
    int inTd;                   /* Set on TD/TH, cleared on /TD or /TH. */
    };
01443
struct htmlTable
/* Bookkeeping for one open table; validateTables keeps these on a stack. */
    {
    struct htmlTable *next;     /* Next (enclosing) table on the stack. */
    struct htmlTableRow *row;   /* Row currently open, NULL between rows. */
    int rowCount;               /* Bumped by validateTables for each TR. */
    };
01451
01452 static void validateTables(struct htmlPage *page,
01453 struct htmlTag *startTag, struct htmlTag *endTag)
01454
01455
01456 {
01457 struct htmlTable *tableStack = NULL, *table;
01458 struct htmlTableRow *row;
01459 struct htmlTag *tag;
01460
01461 for (tag = startTag; tag != endTag; tag = tag->next)
01462 {
01463 if (sameWord(tag->name, "TABLE"))
01464 {
01465 if (tableStack != NULL)
01466 {
01467 if (tableStack->row == NULL || !tableStack->row->inTd)
01468 tagAbort(page, tag, "TABLE inside of another table, but not inside of <TR><TD>\n");
01469 }
01470 AllocVar(table);
01471 slAddHead(&tableStack, table);
01472 }
01473 else if (sameWord(tag->name, "/TABLE"))
01474 {
01475 if ((table = tableStack) == NULL)
01476 tagAbort(page, tag, "Extra </TABLE> tag");
01477 if (table->rowCount == 0)
01478 tagAbort(page, tag, "<TABLE> with no <TR>'s");
01479 if (table->row != NULL)
01480 tagAbort(page, tag, "</TABLE> inside of a row");
01481 tableStack = table->next;
01482 freez(&table);
01483 }
01484 else if (sameWord(tag->name, "TR"))
01485 {
01486 if ((table = tableStack) == NULL)
01487 tagAbort(page, tag, "<TR> outside of TABLE");
01488 if (table->row != NULL)
01489 tagAbort(page, tag, "<TR>...<TR> with no </TR> in between");
01490 AllocVar(table->row);
01491 table->rowCount += 1;
01492 }
01493 else if (sameWord(tag->name, "/TR"))
01494 {
01495 if ((table = tableStack) == NULL)
01496 tagAbort(page, tag, "</TR> outside of TABLE");
01497 if (table->row == NULL)
01498 tagAbort(page, tag, "</TR> with no <TR>");
01499 #ifdef LEGAL_ACTUALLY
01500 if (table->row->inTd)
01501 {
01502 tagAbort(page, tag, "</TR> while <TD> is open");
01503 }
01504 #endif
01505 if (table->row->tdCount == 0)
01506 tagAbort(page, tag, "Empty row in <TABLE>");
01507 freez(&table->row);
01508 }
01509 else if (sameWord(tag->name, "TD") || sameWord(tag->name, "TH"))
01510 {
01511 if ((table = tableStack) == NULL)
01512 tagAbort(page, tag, "<%s> outside of <TABLE>", tag->name);
01513 if ((row = table->row) == NULL)
01514 tagAbort(page, tag, "<%s> outside of <TR>", tag->name);
01515 #ifdef LEGAL_ACTUALLY
01516 if (row->inTd)
01517 {
01518 tagAbort(page, tag, "<%s>...<%s> with no </%s> in between",
01519 tag->name, tag->name, tag->name);
01520 }
01521 #endif
01522 row->inTd = TRUE;
01523 row->tdCount += 1;
01524 }
01525 else if (sameWord(tag->name, "/TD") || sameWord(tag->name, "/TH"))
01526 {
01527 if ((table = tableStack) == NULL)
01528 tagAbort(page, tag, "<%s> outside of <TABLE>", tag->name);
01529 if ((row = table->row) == NULL)
01530 tagAbort(page, tag, "<%s> outside of <TR>", tag->name);
01531 if (!row->inTd)
01532 tagAbort(page, tag, "<%s> with no <%s>", tag->name, tag->name+1);
01533 row->inTd = FALSE;
01534 }
01535 }
01536 if (tableStack != NULL)
01537 tagAbort(page, tag, "Missing </TABLE>");
01538 }
01539
01540 static void checkTagIsInside(struct htmlPage *page, char *outsiders, char *insiders,
01541 struct htmlTag *startTag, struct htmlTag *endTag)
01542
01543 {
01544 char *outDupe = cloneString(outsiders);
01545 char *inDupe = cloneString(insiders);
01546 char *line, *word;
01547 int depth = 0;
01548 struct htmlTag *tag;
01549 struct hash *outOpen = newHash(8);
01550 struct hash *outClose = newHash(8);
01551 struct hash *inHash = newHash(8);
01552 char buf[256];
01553
01554
01555 line = inDupe;
01556 while ((word = nextWord(&line)) != NULL)
01557 {
01558 touppers(word);
01559 hashAdd(inHash, word, NULL);
01560 }
01561
01562
01563 line = outDupe;
01564 while ((word = nextWord(&line)) != NULL)
01565 {
01566 touppers(word);
01567 hashAdd(outOpen, word, NULL);
01568 safef(buf, sizeof(buf), "/%s", word);
01569 hashAdd(outClose, buf, NULL);
01570 }
01571
01572
01573
01574 for (tag = startTag; tag != NULL; tag = tag->next)
01575 {
01576 char *type = tag->name;
01577 if (hashLookup(outOpen, type ))
01578 ++depth;
01579 else if (hashLookup(outClose, type))
01580 --depth;
01581 else if (hashLookup(inHash, type))
01582 {
01583 if (depth <= 0)
01584 tagAbort(page, tag, "%s outside of any of %s", type, outsiders);
01585 }
01586 }
01587 freeHash(&inHash);
01588 freeHash(&outOpen);
01589 freeHash(&outClose);
01590 freeMem(outDupe);
01591 freeMem(inDupe);
01592 }
01593
01594 static void checkNest(struct htmlPage *page,
01595 char *type, struct htmlTag *startTag, struct htmlTag *endTag)
01596
01597 {
01598 struct htmlTag *tag;
01599 int depth = 0;
01600 char endType[256];
01601 safef(endType, sizeof(endType), "/%s", type);
01602 for (tag = startTag; tag != endTag; tag = tag->next)
01603 {
01604 if (sameWord(tag->name, type))
01605 ++depth;
01606 else if (sameWord(tag->name, endType))
01607 {
01608 --depth;
01609 if (depth < 0)
01610 tagAbort(page, tag, "<%s> without preceding <%s>", endType, type);
01611 }
01612 }
01613 if (depth != 0)
01614 errAbort("Missing <%s> tag", endType);
01615 }
01616
static void validateNestingTags(struct htmlPage *page,
        struct htmlTag *startTag, struct htmlTag *endTag,
        char *nesters[], int nesterCount)
/* Run checkNest over startTag..endTag once for each of the nesterCount
 * tag names in nesters, in array order. */
{
int left;
for (left = nesterCount; left > 0; --left)
    {
    checkNest(page, *nesters, startTag, endTag);
    ++nesters;
    }
}
01626
static char *bodyNesters[] =
/* Tags checked for balanced open/close nesting inside the body.  Every
 * entry needs its own comma: adjacent string literals are concatenated
 * by the compiler into a single (bogus) tag name. */
{
    "ADDRESS", "DIV", "H1", "H2", "H3", "H4", "H5", "H6",
    "ACRONYM", "BLOCKQUOTE", "CITE", "CODE", "DEL", "DFN",
    "DIR", "DL", "MENU", "OL", "UL", "CAPTION", "TABLE",
    "A", "MAP", "OBJECT", "FORM"
};
01635
static char *headNesters[] =
/* Tags checked for balanced open/close nesting inside the head. */
{
    "TITLE"
};
01641
01642 static struct htmlTag *validateBody(struct htmlPage *page, struct htmlTag *startTag)
01643
01644
01645 {
01646 struct htmlTag *tag, *endTag = NULL;
01647
01648
01649 for (tag = startTag; tag != NULL; tag = tag->next)
01650 {
01651 if (sameWord(tag->name, "/BODY"))
01652 {
01653 endTag = tag;
01654 break;
01655 }
01656 }
01657 if (endTag == NULL)
01658 errAbort("Missing </BODY>");
01659 validateTables(page, startTag, endTag);
01660 checkTagIsInside(page, "DIR MENU OL UL", "LI", startTag, endTag);
01661 checkTagIsInside(page, "DL", "DD DT", startTag, endTag);
01662 checkTagIsInside(page, "COLGROUP TABLE", "COL", startTag, endTag);
01663 checkTagIsInside(page, "MAP", "AREA", startTag, endTag);
01664 checkTagIsInside(page, "FORM",
01665 "INPUT BUTTON /BUTTON OPTION SELECT /SELECT TEXTAREA /TEXTAREA"
01666 "FIELDSET /FIELDSET"
01667 ,
01668 startTag, endTag);
01669 validateNestingTags(page, startTag, endTag, bodyNesters, ArraySize(bodyNesters));
01670 return endTag->next;
01671 }
01672
static char *urlOkChars()
/* Return a freshly allocated 256 entry table indexed by character value,
 * holding 1 for alphanumerics and for the punctuation =-/%.;:_&+ that
 * validateCgiUrl accepts after the '?'.  The other entries are left as
 * AllocArray provides them (presumably zero). */
{
char *extras = "=-/%.;:_&+";
char *table;
char *p;
int ch;

AllocArray(table, 256);
for (ch = 0; ch < 256; ++ch)
    {
    if (isalnum(ch))
        table[ch] = 1;
    }
for (p = extras; *p != 0; ++p)
    table[(unsigned char)*p] = 1;
return table;
}
01697
01698 static void validateCgiUrl(char *url)
01699
01700 {
01701 if (startsWith("http:", url) || startsWith("https:", url))
01702 {
01703 static char *okChars = NULL;
01704 UBYTE c, *s;
01705 if (okChars == NULL)
01706 okChars = urlOkChars();
01707 url = strchr(url, '?');
01708 if (url != NULL)
01709 {
01710 s = (UBYTE*)url+1;
01711 while ((c = *s++) != 0)
01712 {
01713 if (!okChars[c])
01714 {
01715 errAbort("Character %c not allowed in URL %s", c, url);
01716 }
01717 }
01718 }
01719 }
01720 }
01721
01722 static void validateCgiUrls(struct htmlPage *page)
01723
01724 {
01725 struct htmlForm *form;
01726 struct slName *linkList = htmlPageLinks(page), *link;
01727
01728 for (form = page->forms; form != NULL; form = form->next)
01729 validateCgiUrl(form->action);
01730 for (link = linkList; link != NULL; link = link->next)
01731 validateCgiUrl(link->name);
01732 slFreeList(&linkList);
01733 }
01734
01735 static int countTagsOfType(struct htmlTag *tagList, char *type)
01736
01737 {
01738 struct htmlTag *tag;
01739 int count = 0;
01740 for (tag = tagList; tag != NULL; tag = tag->next)
01741 if (sameString(tag->name, type))
01742 ++count;
01743 return count;
01744 }
01745
static void checkExactlyOne(struct htmlTag *tagList, char *type)
/* Abort unless tagList holds exactly one tag named type. */
{
int found = countTagsOfType(tagList, type);
if (found == 1)
    return;
errAbort("Expecting exactly 1 <%s>, got %d", type, found);
}
01753
01754
01755 void htmlPageFormOrAbort(struct htmlPage *page)
01756
01757 {
01758 if (page == NULL)
01759 errAbort("Can't validate NULL page");
01760 if (page->forms == NULL)
01761 errAbort("No form found");
01762 }
01763
01764 void htmlPageValidateOrAbort(struct htmlPage *page)
01765
01766 {
01767 struct htmlTag *tag;
01768 boolean gotTitle = FALSE;
01769 char *contentType = NULL;
01770
01771 if (page == NULL)
01772 errAbort("Can't validate NULL page");
01773 if (page->header != NULL)
01774 contentType = hashFindVal(page->header, "Content-Type:");
01775 if (contentType == NULL || startsWith("text/html", contentType))
01776 {
01777
01778 for (tag = page->tags; tag != NULL; tag = tag->next)
01779 touppers(tag->name);
01780
01781 checkExactlyOne(page->tags, "BODY");
01782
01783
01784 if ((tag = page->tags) == NULL)
01785 errAbort("No tags");
01786 if (!sameWord(tag->name, "HTML"))
01787 errAbort("Doesn't start with <HTML> tag");
01788 tag = tag->next;
01789 if (tag == NULL || !sameWord(tag->name, "HEAD"))
01790 warn("<HEAD> tag does not follow <HTML> tag");
01791 else
01792 {
01793 for (;;)
01794 {
01795 tag = tag->next;
01796 if (tag == NULL)
01797 errAbort("Missing </HEAD>");
01798 if (sameWord(tag->name, "TITLE"))
01799 gotTitle = TRUE;
01800 if (sameWord(tag->name, "/HEAD"))
01801 break;
01802 }
01803 if (!gotTitle)
01804 warn("No title in <HEAD>");
01805 validateNestingTags(page, page->tags, tag, headNesters, ArraySize(headNesters));
01806 tag = tag->next;
01807 }
01808 if (tag == NULL || !sameWord(tag->name, "BODY"))
01809 errAbort("<BODY> tag does not follow <HTML> tag");
01810 tag = validateBody(page, tag->next);
01811 if (tag == NULL || !sameWord(tag->name, "/HTML"))
01812 errAbort("Missing </HTML>");
01813 validateCgiUrls(page);
01814 }
01815 }
01816