#ifndef __LEXER_H__ #define __LEXER_H__ /**************************************************************************//** * @file * Lexer for HTML and XML Parsers. * * Given an input source, it returns a sequence of tokens. * * GetToken(source) gets the next token * UngetToken(source) provides one level undo * * The tags include an attribute list: * * - linked list of attribute/value nodes * - each node has 2 NULL-terminated strings. * - entities are replaced in attribute values * * white space is compacted if not in preformatted mode * If not in preformatted mode then leading white space * is discarded and subsequent white space sequences * compacted to single space characters. * * If XmlTags is no then Tag names are folded to upper * case and attribute names to lower case. * * Not yet done: * - Doctype subset and marked sections * * @author HTACG, et al (consult git log) * * @copyright * (c) 1998-2021 (W3C) MIT, ERCIM, Keio University, and HTACG. * See tidy.h for the copyright notice. * @par * All Rights Reserved. * @par * See `tidy.h` for the complete license. * * @date Additional updates: consult git log * ******************************************************************************/ #ifdef __cplusplus extern "C" { #endif #include "forward.h" /** @addtogroup internal_api */ /** @{ */ /***************************************************************************//** ** @defgroup lexer_h HTML and XML Lexing ** ** These functions and structures form the internal API for document ** lexing. ** ** @{ ******************************************************************************/ /** * Lexer character types. 
*/ #define digit 1u #define letter 2u #define namechar 4u #define white 8u #define newline 16u #define lowercase 32u #define uppercase 64u #define digithex 128u /** * node->type is one of these values */ typedef enum { RootNode, DocTypeTag, CommentTag, ProcInsTag, TextNode, StartTag, EndTag, StartEndTag, CDATATag, SectionTag, AspTag, JsteTag, PhpTag, XmlDecl } NodeType; /** * Lexer GetToken() states. */ typedef enum { LEX_CONTENT, LEX_GT, LEX_ENDTAG, LEX_STARTTAG, LEX_COMMENT, LEX_DOCTYPE, LEX_PROCINSTR, LEX_CDATA, LEX_SECTION, LEX_ASP, LEX_JSTE, LEX_PHP, LEX_XMLDECL } LexerState; /** * ParseDocTypeDecl state constants. */ typedef enum { DT_INTERMEDIATE, DT_DOCTYPENAME, DT_PUBLICSYSTEM, DT_QUOTEDSTRING, DT_INTSUBSET } ParseDocTypeDeclState; /** * Content model shortcut encoding. * Descriptions are tentative. */ #define CM_UNKNOWN 0 #define CM_EMPTY (1 << 0) /**< Elements with no content. Map to HTML specification. */ #define CM_HTML (1 << 1) /**< Elements that appear outside of "BODY". */ #define CM_HEAD (1 << 2) /**< Elements that can appear within HEAD. */ #define CM_BLOCK (1 << 3) /**< HTML "block" elements. */ #define CM_INLINE (1 << 4) /**< HTML "inline" elements. */ #define CM_LIST (1 << 5) /**< Elements that mark list item ("LI"). */ #define CM_DEFLIST (1 << 6) /**< Elements that mark definition list item ("DL", "DT"). */ #define CM_TABLE (1 << 7) /**< Elements that can appear inside TABLE. */ #define CM_ROWGRP (1 << 8) /**< Used for "THEAD", "TFOOT" or "TBODY". */ #define CM_ROW (1 << 9) /**< Used for "TD", "TH" */ #define CM_FIELD (1 << 10) /**< Elements whose content must be protected against white space movement. Includes some elements that can found in forms. */ #define CM_OBJECT (1 << 11) /**< Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET. */ #define CM_PARAM (1 << 12) /**< Elements that allows "PARAM". */ #define CM_FRAMES (1 << 13) /**< "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. 
*/ #define CM_HEADING (1 << 14) /**< Heading elements (h1, h2, ...). */ #define CM_OPT (1 << 15) /**< Elements with an optional end tag. */ #define CM_IMG (1 << 16) /**< Elements that use "align" attribute for vertical position. */ #define CM_MIXED (1 << 17) /**< Elements with inline and block model. Used to avoid calling InlineDup. */ #define CM_NO_INDENT (1 << 18) /**< Elements whose content needs to be indented only if containing one CM_BLOCK element. */ #define CM_OBSOLETE (1 << 19) /**< Elements that are obsolete (such as "dir", "menu"). */ #define CM_NEW (1 << 20) /**< User defined elements. Used to determine how attributes without value should be printed. */ #define CM_OMITST (1 << 21) /**< Elements that cannot be omitted. */ #define CM_VOID (1 << 22) /**< Elements that are void per https://www.w3.org/TR/2011/WD-html-markup-20110113/syntax.html#syntax-elements. */ /** * If the document uses just HTML 2.0 tags and attributes described * it is HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. * If there are proprietary tags and attributes then describe it as * HTML Proprietary. If it includes the xml-lang or xmlns attributes * but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the * flavors of Voyager (strict, loose or frameset). 
*/ /* unknown */ #define xxxx 0u /* W3C defined HTML/XHTML family document types */ #define HT20 1u #define HT32 2u #define H40S 4u #define H40T 8u #define H40F 16u #define H41S 32u #define H41T 64u #define H41F 128u #define X10S 256u #define X10T 512u #define X10F 1024u #define XH11 2048u #define XB10 4096u /* proprietary stuff */ #define VERS_SUN 8192u #define VERS_NETSCAPE 16384u #define VERS_MICROSOFT 32768u /* special flag */ #define VERS_XML 65536u /* HTML5 */ #define HT50 131072u #define XH50 262144u /* compatibility symbols */ #define VERS_UNKNOWN (xxxx) #define VERS_HTML20 (HT20) #define VERS_HTML32 (HT32) #define VERS_HTML40_STRICT (H40S|H41S|X10S) #define VERS_HTML40_LOOSE (H40T|H41T|X10T) #define VERS_FRAMESET (H40F|H41F|X10F) #define VERS_XHTML11 (XH11) #define VERS_BASIC (XB10) /* HTML5 */ #define VERS_HTML5 (HT50|XH50) /* meta symbols */ #define VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET) #define VERS_IFRAME (VERS_HTML40_LOOSE|VERS_FRAMESET) #define VERS_LOOSE (VERS_HTML20|VERS_HTML32|VERS_IFRAME) #define VERS_EVENTS (VERS_HTML40|VERS_XHTML11) #define VERS_FROM32 (VERS_HTML32|VERS_HTML40|HT50) #define VERS_FROM40 (VERS_HTML40|VERS_XHTML11|VERS_BASIC|VERS_HTML5) #define VERS_XHTML (X10S|X10T|X10F|XH11|XB10|XH50) /* strict */ #define VERS_STRICT (VERS_HTML5|VERS_HTML40_STRICT) /* all W3C defined document types */ #define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40|XH50|HT50) /* all proprietary types */ #define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN) /** * Linked list of class names and styles */ struct _Style; typedef struct _Style TagStyle; struct _Style { tmbstr tag; tmbstr tag_class; tmbstr properties; TagStyle *next; }; /** * Linked list of style properties */ struct _StyleProp; typedef struct _StyleProp StyleProp; struct _StyleProp { tmbstr name; tmbstr value; StyleProp *next; }; /** * Attribute/Value linked list node */ struct _AttVal { AttVal* next; const Attribute* dict; Node* asp; Node* php; int delim; 
tmbstr attribute; tmbstr value; }; /** * Mosaic handles inlines via a separate stack from other elements * We duplicate this to recover from inline markup errors such as: * ~~~ * italic text *
 * <p>more italic text</b> normal text
 * ~~~
 * which for compatibility with Mosaic is mapped to:
 * ~~~
 * <i>italic text</i>
more italic text normal text * ~~~ * Note that any inline end tag pop's the effect of the current * inline start tag, so that `` pop's `` in the above example. */ struct _IStack { IStack* next; const Dict* tag; /**< tag's dictionary definition */ tmbstr element; /**< name (NULL for text nodes) */ AttVal* attributes; }; /** * HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc. */ struct _Node { Node* parent; /**< tree structure */ Node* prev; Node* next; Node* content; Node* last; AttVal* attributes; const Dict* was; /**< old tag when it was changed */ const Dict* tag; /**< tag's dictionary definition */ tmbstr element; /**< name (NULL for text nodes) */ uint start; /**< start of span onto text array */ uint end; /**< end of span onto text array */ NodeType type; /**< TextNode, StartTag, EndTag etc. */ uint line; /**< current line of document */ uint column; /**< current column of document */ int idx; /**< general purpose register */ Bool closed; /**< true if closed by explicit end tag */ Bool implicit; /**< true if inferred */ Bool linebreak; /**< true if followed by a line break */ }; /** * The following are private to the lexer. * Use `NewLexer()` to create a lexer, and `FreeLexer()` to free it. */ struct _Lexer { uint lines; /**< lines seen */ uint columns; /**< at start of current token */ Bool waswhite; /**< used to collapse contiguous white space */ Bool pushed; /**< true after token has been pushed back */ Bool insertspace; /**< when space is moved after end tag */ Bool excludeBlocks; /**< Netscape compatibility */ Bool exiled; /**< true if moved out of table */ Bool isvoyager; /**< true if xmlns attribute on html element (i.e., "Voyager" was the W3C codename for XHTML). */ uint versions; /**< bit vector of HTML versions */ uint doctype; /**< version as given by doctype (if any) */ uint versionEmitted; /**< version of doctype emitted */ Bool bad_doctype; /**< e.g. 
if html or PUBLIC is missing */ uint txtstart; /**< start of current node */ uint txtend; /**< end of current node */ LexerState state; /**< state of lexer's finite state machine */ Node* token; /**< last token returned by GetToken() */ Node* itoken; /**< last duplicate inline returned by GetToken() */ Node* root; /**< remember root node of the document */ Node* parent; /**< remember parent node for CDATA elements */ Bool seenEndBody; /**< true if a `