00001 /* htmlPage - stuff to read, parse, and submit htmlPages and forms. 00002 * 00003 * typical usage is: 00004 * struct htmlPage *page = htmlPageGet(url); 00005 * htmlPageValidateOrAbort(page); 00006 * var = htmlPageGetVar(page, page->forms, "org"); 00007 * if (var != NULL) 00008 * printf("Organism = %s\n", var->curVal); 00009 * htmlPageSetVar(page, page->forms, "org", "Human"); 00010 * newPage = htmlPageFromForm(page, page->forms, "submit", "Go"); 00011 */ 00012 00013 #ifndef HTMLPAGE_H 00014 #define HTMLPAGE_H 00015 00016 #ifndef DYSTRING_H 00017 #include "dystring.h" 00018 #endif 00019 00020 struct htmlStatus 00021 /* HTTP version and status code. */ 00022 { 00023 struct htmlStatus *next; /* Next in list. */ 00024 char *version; /* Usually something like HTTP/1.1 */ 00025 int status; /* HTTP status code. 200 is good. */ 00026 }; 00027 00028 struct htmlCookie 00029 /* A cookie - stored by browser usually. We need to echo it 00030 * back when we post forms. */ 00031 { 00032 struct htmlCookie *next; /* Next in list. */ 00033 char *name; /* Cookie name. */ 00034 char *value; /* Cookie value. */ 00035 char *domain; /* The set of web domains this applies to. */ 00036 char *path; /* Cookie applies below this path I guess. */ 00037 char *expires; /* Expiration date. */ 00038 boolean secure; /* Is it a secure cookie? */ 00039 }; 00040 00041 struct htmlAttribute 00042 /* An html attribute - part of a set of name/values pairs in a tag. */ 00043 { 00044 struct htmlAttribute *next; 00045 char *name; /* Attribute name. */ 00046 char *val; /* Attribute value. */ 00047 }; 00048 00049 struct htmlTag 00050 /* A html tag - includes attribute list and parent, but no text. */ 00051 { 00052 struct htmlTag *next; 00053 char *name; /* Tag name. */ 00054 struct htmlAttribute *attributes; /* Attribute list. */ 00055 char *start; /* Start of this tag. Not allocated here.*/ 00056 char *end; /* End of tag (one past closing '>') Not allocated here.*/ 00057 }; 00058 00059 struct htmlFormVar 00060 /* A variable within an html form - from input, button, etc. */ 00061 { 00062 struct htmlFormVar *next; /* Next in list. */ 00063 char *name; /* Variable name. Not allocated here.*/ 00064 char *tagName; /* Name of tag. Not allocated here. */ 00065 char *type; /* Variable type. Not allocated here. */ 00066 char *curVal; /* Current value if any. Allocated here. */ 00067 struct slName *values; /* List of available values. Null if textBox. */ 00068 struct slRef *tags; /* List of references associated tags. */ 00069 }; 00070 00071 struct htmlForm 00072 /* A form within an html page. */ 00073 { 00074 struct htmlForm *next; /* Next form in list. */ 00075 char *name; /* Name (n/a if not defined). Not allocated here. */ 00076 char *action; /* Action attribute value. Not allocated here. */ 00077 char *method; /* Defaults to "GET". Not allocated here. */ 00078 struct htmlTag *startTag; /* Tag that holds <FORM>. Not allocated here. */ 00079 struct htmlTag *endTag; /* Tag one past </FORM> . Not allocated here. */ 00080 struct htmlFormVar *vars; /* List of form variables. */ 00081 }; 00082 00083 struct htmlPage 00084 /* A complete html page parsed out. */ 00085 { 00086 struct htmlPage *next; 00087 char *url; /* Url that produced this page. */ 00088 struct htmlStatus *status; /* Version and status. */ 00089 struct hash *header; /* Hash of header lines (cookies, etc.) */ 00090 struct htmlCookie *cookies; /* Associated cookies if any. */ 00091 char *fullText; /* Full unparsed text including headers. */ 00092 char *htmlText; /* Text unparsed after header. Same mem as fullText. */ 00093 struct htmlTag *tags; /* List of tags in this page. */ 00094 struct htmlForm *forms; /* List of all forms. */ 00095 }; 00096 00097 void htmlStatusFree(struct htmlStatus **pStatus); 00098 /* Free up resources associated with status */ 00099 00100 void htmlStatusFreeList(struct htmlStatus **pList); 00101 /* Free a list of dynamically allocated htmlStatus's */ 00102 00103 struct htmlStatus *htmlStatusParse(char **pText); 00104 /* Read in status from first line. Update pText to point to next line. 00105 * Note unlike many routines here, this does not insert zeros into text. */ 00106 00107 void htmlCookieFree(struct htmlCookie **pCookie); 00108 /* Free memory associated with cookie. */ 00109 00110 void htmlCookieFreeList(struct htmlCookie **pList); 00111 /* Free a list of dynamically allocated htmlCookie's */ 00112 00113 struct htmlCookie *htmlCookieFileRead(char *fileName); 00114 /* Read cookies from a line oriented file. First word in line 00115 * is the cookie name, the rest of the line the cookie value. */ 00116 00117 void htmlAttributeFree(struct htmlAttribute **pAttribute); 00118 /* Free up resources associated with attribute. */ 00119 00120 void htmlAttributeFreeList(struct htmlAttribute **pList); 00121 /* Free a list of dynamically allocated htmlAttribute's */ 00122 00123 char *htmlTagAttributeVal(struct htmlPage *page, struct htmlTag *tag, 00124 char *name, char *defaultVal); 00125 /* Return value of named attribute, or defaultVal if attribute doesn't exist. */ 00126 00127 char *htmlTagAttributeNeeded(struct htmlPage *page, struct htmlTag *tag, char *name); 00128 /* Return named tag attribute. Complain and return "n/a" if it 00129 * doesn't exist. */ 00130 00131 void htmlTagFree(struct htmlTag **pTag); 00132 /* Free up resources associated with tag. */ 00133 00134 void htmlTagFreeList(struct htmlTag **pList); 00135 /* Free a list of dynamically allocated htmlTag's */ 00136 00137 void htmlFormVarFree(struct htmlFormVar **pVar); 00138 /* Free up resources associated with form variable. */ 00139 00140 void htmlFormVarFreeList(struct htmlFormVar **pList); 00141 /* Free a list of dynamically allocated htmlFormVar's */ 00142 00143 void htmlFormVarPrint(struct htmlFormVar *var, FILE *f, char *prefix); 00144 /* Print out variable to file, prepending prefix. */ 00145 00146 void htmlFormFree(struct htmlForm **pForm); 00147 /* Free up resources associated with form variable. */ 00148 00149 void htmlFormFreeList(struct htmlForm **pList); 00150 /* Free a list of dynamically allocated htmlForm's */ 00151 00152 void htmlFormPrint(struct htmlForm *form, FILE *f); 00153 /* Print out form structure. */ 00154 00155 char *htmlFormCgiVars(struct htmlPage *page, struct htmlForm *form, 00156 char *buttonName, char *buttonVal, struct dyString *dyHeader); 00157 /* Return cgi vars in name=val format from use having pressed 00158 * submit button of given name and value. */ 00159 00160 struct htmlForm *htmlFormGet(struct htmlPage *page, char *name); 00161 /* Get named form. */ 00162 00163 struct htmlFormVar *htmlFormVarGet(struct htmlForm *form, char *name); 00164 /* Get named variable. */ 00165 00166 void htmlFormVarSet(struct htmlForm *form, char *name, char *val); 00167 /* Set variable to given value. */ 00168 00169 struct htmlFormVar *htmlPageGetVar(struct htmlPage *page, struct htmlForm *form, char *name); 00170 /* Get named variable. If form is NULL, first form in page is used. */ 00171 00172 void htmlPageSetVar(struct htmlPage *page, struct htmlForm *form, char *name, char *val); 00173 /* Set variable to given value. If form is NULL, first form in page is used. */ 00174 00175 void htmlPageFree(struct htmlPage **pPage); 00176 /* Free up resources associated with htmlPage. */ 00177 00178 void htmlPageFreeList(struct htmlPage **pList); 00179 /* Free a list of dynamically allocated htmlPage's */ 00180 00181 char *htmlExpandUrl(char *base, char *url); 00182 /* Expand URL that is relative to base to stand on it's own. 00183 * Return NULL if it's not http or https. */ 00184 00185 char *htmlNextCrLfLine(char **pS); 00186 /* Return zero-terminated line and advance *pS to start of 00187 * next line. Return NULL at end of file. Warn if there is 00188 * no <CR>. */ 00189 00190 struct slName *htmlPageScanAttribute(struct htmlPage *page, 00191 char *tagName, char *attribute); 00192 /* Scan page for values of particular attribute in particular tag. 00193 * if tag is NULL then scans in all tags. */ 00194 00195 struct slName *htmlPageLinks(struct htmlPage *page); 00196 /* Scan through tags list and pull out HREF attributes. */ 00197 00198 void htmlPageFormOrAbort(struct htmlPage *page); 00199 /* Aborts if no FORM found */ 00200 00201 void htmlPageValidateOrAbort(struct htmlPage *page); 00202 /* Do some basic validations. Aborts if there is a problem. */ 00203 00204 char *htmlSlurpWithCookies(char *url, struct htmlCookie *cookies); 00205 /* Send get message to url with cookies, and return full response as 00206 * a dyString. This is not parsed or validated, and includes http 00207 * header lines. Typically you'd pass this to htmlPageParse() to 00208 * get an actual page. */ 00209 00210 struct htmlPage *htmlPageParse(char *url, char *fullText); 00211 /* Parse out page and return. Warn and return NULL if problem. */ 00212 00213 struct htmlPage *htmlPageParseOk(char *url, char *fullText); 00214 /* Parse out page and return only if status ok. */ 00215 00216 struct htmlPage *htmlPageParseNoHead(char *url, char *htmlText); 00217 /* Parse out page in memory (past http header if any) and return. */ 00218 00219 struct htmlPage *htmlPageFromForm(struct htmlPage *origPage, struct htmlForm *form, 00220 char *buttonName, char *buttonVal); 00221 /* Return a new htmlPage based on response to pressing indicated button 00222 * on indicated form in origPage. */ 00223 00224 struct htmlPage *htmlPageGetWithCookies(char *url, struct htmlCookie *cookies); 00225 /* Get page from URL giving server the given cookies. Note only the 00226 * name and value parts of the cookies need to be filled in. */ 00227 00228 struct htmlPage *htmlPageGet(char *url); 00229 /* Get page from URL (may be a file). */ 00230 00231 struct htmlPage *htmlPageForwarded(char *url, struct htmlCookie *cookies); 00232 /* Get html page. If it's just a forwarding link then get do the 00233 * forwarding. Cookies is a possibly empty list of cookies with 00234 * name and value parts filled in. */ 00235 00236 struct htmlPage *htmlPageForwardedNoAbort(char *url, struct htmlCookie *cookies); 00237 /* Try and get an HTML page. Print warning and return NULL if there's a problem. */ 00238 #endif /* HTMLPAGE_H */ 00239
1.5.2