[964] | 1 | /* |
---|
| 2 | * Summary: interface for an HTML 4.0 non-verifying parser |
---|
| 3 | * Description: this module implements an HTML 4.0 non-verifying parser |
---|
| 4 | * with API compatible with the XML parser ones. It should |
---|
| 5 | * be able to parse "real world" HTML, even if severely |
---|
| 6 | * broken from a specification point of view. |
---|
| 7 | * |
---|
| 8 | * Copy: See Copyright for the status of this software. |
---|
| 9 | * |
---|
| 10 | * Author: Daniel Veillard |
---|
| 11 | */ |
---|
| 12 | |
---|
| 13 | #ifndef __HTML_PARSER_H__ |
---|
| 14 | #define __HTML_PARSER_H__ |
---|
| 15 | #include <libxml/xmlversion.h> |
---|
| 16 | #include <libxml/parser.h> |
---|
| 17 | |
---|
| 18 | #ifdef LIBXML_HTML_ENABLED |
---|
| 19 | |
---|
| 20 | #ifdef __cplusplus |
---|
| 21 | extern "C" { |
---|
| 22 | #endif |
---|
| 23 | |
---|
| 24 | /* |
---|
| 25 | * Most of the back-end structures from XML and HTML are shared. |
---|
| 26 | */ |
---|
| 27 | typedef xmlParserCtxt htmlParserCtxt; |
---|
| 28 | typedef xmlParserCtxtPtr htmlParserCtxtPtr; |
---|
| 29 | typedef xmlParserNodeInfo htmlParserNodeInfo; |
---|
| 30 | typedef xmlSAXHandler htmlSAXHandler; |
---|
| 31 | typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; |
---|
| 32 | typedef xmlParserInput htmlParserInput; |
---|
| 33 | typedef xmlParserInputPtr htmlParserInputPtr; |
---|
| 34 | typedef xmlDocPtr htmlDocPtr; |
---|
| 35 | typedef xmlNodePtr htmlNodePtr; |
---|
| 36 | |
---|
/*
 * Internal description of an HTML element. A single layout serves both
 * HTML 4.01 and XHTML 1.0, which share the same structure.
 */
typedef struct _htmlElemDesc htmlElemDesc;
typedef htmlElemDesc *htmlElemDescPtr;
struct _htmlElemDesc {
    const char *name;           /* tag name */
    char        startTag;       /* whether the start tag can be implied */
    char        endTag;         /* whether the end tag can be implied */
    char        saveEndTag;     /* whether the end tag should be saved */
    char        empty;          /* is this an empty element? */
    char        depr;           /* is this a deprecated element? */
    char        dtd;            /* 1: only in Loose DTD, 2: only Frameset one */
    char        isinline;       /* 0: block element, 1: inline element */
    const char *desc;           /* human-readable description */

    /*
     * NRK Jan.2003: fields encapsulating HTML structure.
     *
     * Known limitations of this representation:
     *  - it records only whether sub-elements are allowed, not whether
     *    an element *requires* them;
     *  - it does not say where CDATA and PCDATA are allowed;
     *  - some element relationships are not fully represented; those
     *    are flagged with the word MODIFIER.
     */
    const char **subelts;       /* allowed sub-elements of this element */
    const char  *defaultsubelt; /* sub-element suggested for auto-repair
                                   if necessary, or NULL */
    const char **attrs_opt;     /* optional attributes */
    const char **attrs_depr;    /* additional deprecated attributes */
    const char **attrs_req;     /* required attributes */
};
---|
| 71 | |
---|
/*
 * Internal description of an HTML entity: its Unicode value, its name
 * and a textual description.
 */
typedef struct _htmlEntityDesc htmlEntityDesc;
typedef htmlEntityDesc *htmlEntityDescPtr;
struct _htmlEntityDesc {
    unsigned int value;     /* the Unicode value for the character */
    const char  *name;      /* the entity name */
    const char  *desc;      /* the description */
};
---|
| 82 | |
---|
| 83 | /* |
---|
| 84 | * There is only few public functions. |
---|
| 85 | */ |
---|
| 86 | XMLPUBFUN const htmlElemDesc * XMLCALL |
---|
| 87 | htmlTagLookup (const xmlChar *tag); |
---|
| 88 | XMLPUBFUN const htmlEntityDesc * XMLCALL |
---|
| 89 | htmlEntityLookup(const xmlChar *name); |
---|
| 90 | XMLPUBFUN const htmlEntityDesc * XMLCALL |
---|
| 91 | htmlEntityValueLookup(unsigned int value); |
---|
| 92 | |
---|
| 93 | XMLPUBFUN int XMLCALL |
---|
| 94 | htmlIsAutoClosed(htmlDocPtr doc, |
---|
| 95 | htmlNodePtr elem); |
---|
| 96 | XMLPUBFUN int XMLCALL |
---|
| 97 | htmlAutoCloseTag(htmlDocPtr doc, |
---|
| 98 | const xmlChar *name, |
---|
| 99 | htmlNodePtr elem); |
---|
| 100 | XMLPUBFUN const htmlEntityDesc * XMLCALL |
---|
| 101 | htmlParseEntityRef(htmlParserCtxtPtr ctxt, |
---|
| 102 | const xmlChar **str); |
---|
| 103 | XMLPUBFUN int XMLCALL |
---|
| 104 | htmlParseCharRef(htmlParserCtxtPtr ctxt); |
---|
| 105 | XMLPUBFUN void XMLCALL |
---|
| 106 | htmlParseElement(htmlParserCtxtPtr ctxt); |
---|
| 107 | |
---|
| 108 | XMLPUBFUN htmlParserCtxtPtr XMLCALL |
---|
| 109 | htmlCreateMemoryParserCtxt(const char *buffer, |
---|
| 110 | int size); |
---|
| 111 | |
---|
| 112 | XMLPUBFUN int XMLCALL |
---|
| 113 | htmlParseDocument(htmlParserCtxtPtr ctxt); |
---|
| 114 | XMLPUBFUN htmlDocPtr XMLCALL |
---|
| 115 | htmlSAXParseDoc (xmlChar *cur, |
---|
| 116 | const char *encoding, |
---|
| 117 | htmlSAXHandlerPtr sax, |
---|
| 118 | void *userData); |
---|
| 119 | XMLPUBFUN htmlDocPtr XMLCALL |
---|
| 120 | htmlParseDoc (xmlChar *cur, |
---|
| 121 | const char *encoding); |
---|
| 122 | XMLPUBFUN htmlDocPtr XMLCALL |
---|
| 123 | htmlSAXParseFile(const char *filename, |
---|
| 124 | const char *encoding, |
---|
| 125 | htmlSAXHandlerPtr sax, |
---|
| 126 | void *userData); |
---|
| 127 | XMLPUBFUN htmlDocPtr XMLCALL |
---|
| 128 | htmlParseFile (const char *filename, |
---|
| 129 | const char *encoding); |
---|
| 130 | XMLPUBFUN int XMLCALL |
---|
| 131 | UTF8ToHtml (unsigned char *out, |
---|
| 132 | int *outlen, |
---|
| 133 | const unsigned char *in, |
---|
| 134 | int *inlen); |
---|
| 135 | XMLPUBFUN int XMLCALL |
---|
| 136 | htmlEncodeEntities(unsigned char *out, |
---|
| 137 | int *outlen, |
---|
| 138 | const unsigned char *in, |
---|
| 139 | int *inlen, int quoteChar); |
---|
| 140 | XMLPUBFUN int XMLCALL |
---|
| 141 | htmlIsScriptAttribute(const xmlChar *name); |
---|
| 142 | XMLPUBFUN int XMLCALL |
---|
| 143 | htmlHandleOmittedElem(int val); |
---|
| 144 | |
---|
#ifdef LIBXML_PUSH_ENABLED
/*
 * Interfaces for the push mode; declared only when LIBXML_PUSH_ENABLED
 * is defined.
 */
XMLPUBFUN htmlParserCtxtPtr XMLCALL
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
                         void *user_data,
                         const char *chunk,
                         int size,
                         const char *filename,
                         xmlCharEncoding enc);
XMLPUBFUN int XMLCALL
htmlParseChunk(htmlParserCtxtPtr ctxt,
               const char *chunk,
               int size,
               int terminate);
#endif /* LIBXML_PUSH_ENABLED */
---|
| 162 | |
---|
| 163 | XMLPUBFUN void XMLCALL |
---|
| 164 | htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); |
---|
| 165 | |
---|
| 166 | /* |
---|
| 167 | * New set of simpler/more flexible APIs |
---|
| 168 | */ |
---|
/**
 * htmlParserOption:
 *
 * This is the set of HTML parser options that can be passed down
 * to htmlReadDoc() and similar calls through their `options` argument.
 * Values are single bits, so several options can be OR-ed together.
 */
typedef enum {
    HTML_PARSE_NOERROR   = (1 << 5),    /* suppress error reports */
    HTML_PARSE_NOWARNING = (1 << 6),    /* suppress warning reports */
    HTML_PARSE_PEDANTIC  = (1 << 7),    /* pedantic error reporting */
    HTML_PARSE_NOBLANKS  = (1 << 8),    /* remove blank nodes */
    HTML_PARSE_NONET     = (1 << 11)    /* forbid network access */
} htmlParserOption;
---|
| 182 | |
---|
| 183 | XMLPUBFUN void XMLCALL |
---|
| 184 | htmlCtxtReset (htmlParserCtxtPtr ctxt); |
---|
| 185 | XMLPUBFUN int XMLCALL |
---|
| 186 | htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, |
---|
| 187 | int options); |
---|
| 188 | XMLPUBFUN htmlDocPtr XMLCALL |
---|
| 189 | htmlReadDoc (const xmlChar *cur, |
---|
| 190 | const char *URL, |
---|
| 191 | const char *encoding, |
---|
| 192 | int options); |
---|
| 193 | XMLPUBFUN htmlDocPtr XMLCALL |
---|
| 194 | htmlReadFile (const char *URL, |
---|
| 195 | const char *encoding, |
---|
| 196 | int options); |
---|
| 197 | XMLPUBFUN htmlDocPtr XMLCALL |
---|
| 198 | htmlReadMemory (const char *buffer, |
---|
| 199 | int size, |
---|
| 200 | const char *URL, |
---|
| 201 | const char *encoding, |
---|
| 202 | int options); |
---|
| 203 | XMLPUBFUN htmlDocPtr XMLCALL |
---|
| 204 | htmlReadFd (int fd, |
---|
| 205 | const char *URL, |
---|
| 206 | const char *encoding, |
---|
| 207 | int options); |
---|
| 208 | XMLPUBFUN htmlDocPtr XMLCALL |
---|
| 209 | htmlReadIO (xmlInputReadCallback ioread, |
---|
| 210 | xmlInputCloseCallback ioclose, |
---|
| 211 | void *ioctx, |
---|
| 212 | const char *URL, |
---|
| 213 | const char *encoding, |
---|
| 214 | int options); |
---|
| 215 | XMLPUBFUN htmlDocPtr XMLCALL |
---|
| 216 | htmlCtxtReadDoc (xmlParserCtxtPtr ctxt, |
---|
| 217 | const xmlChar *cur, |
---|
| 218 | const char *URL, |
---|
| 219 | const char *encoding, |
---|
| 220 | int options); |
---|
| 221 | XMLPUBFUN htmlDocPtr XMLCALL |
---|
| 222 | htmlCtxtReadFile (xmlParserCtxtPtr ctxt, |
---|
| 223 | const char *filename, |
---|
| 224 | const char *encoding, |
---|
| 225 | int options); |
---|
| 226 | XMLPUBFUN htmlDocPtr XMLCALL |
---|
| 227 | htmlCtxtReadMemory (xmlParserCtxtPtr ctxt, |
---|
| 228 | const char *buffer, |
---|
| 229 | int size, |
---|
| 230 | const char *URL, |
---|
| 231 | const char *encoding, |
---|
| 232 | int options); |
---|
| 233 | XMLPUBFUN htmlDocPtr XMLCALL |
---|
| 234 | htmlCtxtReadFd (xmlParserCtxtPtr ctxt, |
---|
| 235 | int fd, |
---|
| 236 | const char *URL, |
---|
| 237 | const char *encoding, |
---|
| 238 | int options); |
---|
| 239 | XMLPUBFUN htmlDocPtr XMLCALL |
---|
| 240 | htmlCtxtReadIO (xmlParserCtxtPtr ctxt, |
---|
| 241 | xmlInputReadCallback ioread, |
---|
| 242 | xmlInputCloseCallback ioclose, |
---|
| 243 | void *ioctx, |
---|
| 244 | const char *URL, |
---|
| 245 | const char *encoding, |
---|
| 246 | int options); |
---|
| 247 | |
---|
/*
 * NRK/Jan2003: further knowledge of HTML structure.
 *
 * Status codes are bit values. HTML_REQUIRED deliberately contains the
 * HTML_VALID bit, so (status & HTML_VALID) is true for it as well.
 */
typedef enum {
    HTML_NA         = 0,                    /* something we don't check at all */
    HTML_INVALID    = 0x1,
    HTML_DEPRECATED = 0x2,
    HTML_VALID      = 0x4,
    HTML_REQUIRED   = (0x8 | HTML_VALID)    /* 0xc: VALID bit set */
} htmlStatus;
---|
| 257 | |
---|
| 258 | /* Using htmlElemDesc rather than name here, to emphasise the fact |
---|
| 259 | that otherwise there's a lookup overhead |
---|
| 260 | */ |
---|
| 261 | XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ; |
---|
| 262 | XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; |
---|
| 263 | XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ; |
---|
| 264 | XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ; |
---|
/**
 * htmlDefaultSubelement:
 * @elt: HTML element
 *
 * Returns the default subelement for this element
 *
 * Both the argument and the whole expansion are parenthesized so that
 * expression arguments such as `table + i` expand correctly.
 */
#define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
---|
/**
 * htmlElementAllowedHereDesc:
 * @parent: HTML parent element
 * @elt: HTML element
 *
 * Checks whether an HTML element description may be a direct child of
 * the specified element, by passing the child's name to
 * htmlElementAllowedHere().
 *
 * Returns 1 if allowed; 0 otherwise.
 */
#define htmlElementAllowedHereDesc(parent, elt) \
    (htmlElementAllowedHere((parent), (elt)->name))
---|
/**
 * htmlRequiredAttrs:
 * @elt: HTML element
 *
 * Returns the attributes required for the specified element
 * (the element's attrs_req field).
 */
#define htmlRequiredAttrs(elt) ((elt)->attrs_req)
---|
| 291 | |
---|
| 292 | |
---|
| 293 | #ifdef __cplusplus |
---|
| 294 | } |
---|
| 295 | #endif |
---|
| 296 | |
---|
| 297 | #endif /* LIBXML_HTML_ENABLED */ |
---|
| 298 | #endif /* __HTML_PARSER_H__ */ |
---|