diff --git a/src/clean.c b/src/clean.c
index 5e2b936..e314ba6 100644
--- a/src/clean.c
+++ b/src/clean.c
@@ -1585,11 +1585,16 @@ void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
  */
 void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
 {
+    Stack *stack = TY_(newStack)(doc, 16); /* siblings to resume at after a subtree */
+    Node *next;
+
     tmbchar indent_buf[ 32 ];
     uint indent;
 
     while (node)
     {
+        next = node->next; /* saved before we descend into node->content */
+
         if ( nodeIsBLOCKQUOTE(node) && node->implicit )
         {
             indent = 1;
@@ -1602,19 +1607,29 @@ void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
                 StripOnlyChild( doc, node );
             }
 
-            if (node->content)
-                TY_(BQ2Div)( doc, node->content );
-
             TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
                              2*indent);
 
             RenameElem( doc, node, TidyTag_DIV );
             TY_(AddStyleProperty)(doc, node, indent_buf );
+
+            if (node->content)
+            {
+                TY_(push)(stack, next); /* come back to the sibling later */
+                node = node->content;
+                continue;
+            }
         }
         else if (node->content)
-            TY_(BQ2Div)( doc, node->content );
+        {
+            TY_(push)(stack, next); /* come back to the sibling later */
+            node = node->content;
+            continue;
+        }
 
-        node = node->next;
+        node = next ? next : TY_(pop)(stack); /* NULL from pop ends the walk */
     }
+
+    TY_(freeStack)(stack); /* release the traversal stack's storage */
 }
 
@@ -2736,30 +2751,44 @@ void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
  */
 static void StyleToHead(TidyDocImpl* doc, Node *head, Node *node, Bool fix, int indent)
 {
-    Node *next;
-    while (node)
-    {
-        next = node->next; /* get 'next' now , in case the node is moved */
-        /* dbg_show_node(doc, node, 0, indent); */
-        if (nodeIsSTYLE(node))
-        {
-            if (fix)
-            {
-                TY_(RemoveNode)(node); /* unhook style node from body */
-                TY_(InsertNodeAtEnd)(head, node); /* add to end of head */
-                TY_(Report)(doc, node, head, MOVED_STYLE_TO_HEAD); /* report move */
-            }
-            else
-            {
-                TY_(Report)(doc, node, head, FOUND_STYLE_IN_BODY);
-            }
-        }
-        else if (node->content)
-        {
-            StyleToHead(doc, head, node->content, fix, indent + 1);
-        }
-        node = next; /* process the 'next', if any */
-    }
+    Stack *stack = TY_(newStack)(doc, 16); /* siblings to resume at after a subtree */
+    Node *next;
+
+    while (node)
+    {
+        next = node->next; /* get 'next' now, in case the node is moved */
+
+        if (nodeIsSTYLE(node))
+        {
+            if (fix)
+            {
+                TY_(RemoveNode)(node); /* unhook style node from body */
+                TY_(InsertNodeAtEnd)(head, node); /* add to end of head */
+                TY_(Report)(doc, node, head, MOVED_STYLE_TO_HEAD); /* report move */
+            }
+            else
+            {
+                TY_(Report)(doc, node, head, FOUND_STYLE_IN_BODY);
+            }
+        }
+        else if (node->content)
+        {
+            TY_(push)(stack, next); /* come back to the sibling later */
+            node = node->content;
+            indent++;
+            continue;
+        }
+
+        if (next)
+            node = next;
+        else
+        {
+            node = TY_(pop)(stack); /* NULL from pop ends the walk */
+            indent--;
+        }
+    }
+
+    TY_(freeStack)(stack); /* release the traversal stack's storage */
 }
 
 
diff --git a/src/lexer.c b/src/lexer.c
index 0fe5dd6..fa8d6fb 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -877,15 +877,6 @@ static tmbchar LastChar( tmbstr str )
     return 0;
 }
 
-/*
-   node->type is one of these:
-
-    #define TextNode    1
-    #define StartTag    2
-    #define EndTag      3
-    #define StartEndTag 4
-*/
-
 Lexer* TY_(NewLexer)( TidyDocImpl* doc )
 {
     Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) );
@@ -1545,13 +1536,7 @@ void TY_(FreeNode)( TidyDocImpl* doc, Node *node )
         }
     }
 #endif
-    /* this is no good ;=((
-    if (node && doc && doc->lexer) {
-        if (node ==
doc->lexer->token) {
-            doc->lexer->token = NULL; // TY_(NewNode)( doc->lexer->allocator, doc->lexer );
-        }
-    }
-    ----------------- */
+
     while ( node )
     {
         Node* next = node->next;
@@ -4462,11 +4447,107 @@ static Node *ParseDocTypeDecl(TidyDocImpl* doc)
     return NULL;
 }
 
-/*
- * local variables:
- * mode: c
- * indent-tabs-mode: nil
- * c-basic-offset: 4
- * eval: (c-set-offset 'substatement-open 0)
- * end:
+
+/****************************************************************************//*
+ ** MARK: - Node Stack
+ ***************************************************************************/
+
+
+/**
+ * Create a new stack with a given starting capacity. If memory allocation
+ * fails, then the allocator will panic the program automatically.
  */
+Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity)
+{
+    Stack *stack = (Stack *)TidyAlloc(doc->allocator, sizeof(Stack));
+    stack->top = -1;
+    stack->capacity = capacity;
+    stack->firstNode = (Node **)TidyAlloc(doc->allocator, stack->capacity * sizeof(Node*));
+    stack->allocator = doc->allocator;
+    return stack;
+}
+
+
+/**
+ * Increase the stack size. This will be called automatically when the
+ * current stack is full. If memory allocation fails, then the allocator
+ * will panic the program automatically.
+ *
+ * The capacity counts elements, so the allocation size is scaled by the
+ * element size. A stack whose capacity is zero grows to a single slot.
+ */
+void TY_(growStack)(Stack *stack)
+{
+    uint new_capacity = stack->capacity ? stack->capacity * 2 : 1;
+
+    Node **firstNode = (Node **)TidyAlloc(stack->allocator, new_capacity * sizeof(Node*));
+
+    memcpy( firstNode, stack->firstNode, sizeof(Node*) * (stack->top + 1) );
+    TidyFree(stack->allocator, stack->firstNode);
+
+    stack->firstNode = firstNode;
+    stack->capacity = new_capacity;
+}
+
+
+/**
+ * Stack is full when top is equal to the last index.
+ */
+Bool TY_(stackFull)(Stack *stack)
+{
+    return stack->top == stack->capacity - 1;
+}
+
+
+/**
+ * Stack is empty when top is equal to -1
+ */
+Bool TY_(stackEmpty)(Stack *stack)
+{
+    return stack->top == -1;
+}
+
+
+/**
+ * Push an item to the stack, growing it first if full. A NULL node is not stored.
+ */
+void TY_(push)(Stack *stack, Node *node)
+{
+    if (TY_(stackFull)(stack))
+        TY_(growStack)(stack);
+
+    if (node)
+        stack->firstNode[++stack->top] = node;
+}
+
+
+/**
+ * Pop an item from the stack. Returns NULL when the stack is empty.
+ */
+Node* TY_(pop)(Stack *stack)
+{
+    return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top--];
+}
+
+
+/**
+ * Peek at the stack: return the top item without removing it, or NULL
+ * when the stack is empty.
+ */
+FUNC_UNUSED Node* TY_(peek)(Stack *stack)
+{
+    return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top];
+}
+
+/**
+ * Frees the stack's node array when done and resets its fields. The Stack
+ * structure itself is not released by this function.
+ */
+void TY_(freeStack)(Stack *stack)
+{
+    TidyFree( stack->allocator, stack->firstNode );
+    stack->top = -1;
+    stack->capacity = 0;
+    stack->firstNode = NULL;
+    stack->allocator = NULL;
+}
diff --git a/src/lexer.h b/src/lexer.h
index 113a9f4..9d49898 100644
--- a/src/lexer.h
+++ b/src/lexer.h
@@ -1,33 +1,46 @@
 #ifndef __LEXER_H__
 #define __LEXER_H__
 
-/* lexer.h -- Lexer for html parser
-
-   (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
-   See tidy.h for the copyright notice.
-   Given an input source, it returns a sequence of tokens.
-
-   GetToken(source) gets the next token
-   UngetToken(source) provides one level undo
-
-   The tags include an attribute list:
-
-    - linked list of attribute/value nodes
-    - each node has 2 NULL-terminated strings.
-    - entities are replaced in attribute values
-
-   white space is compacted if not in preformatted mode
-   If not in preformatted mode then leading white space
-   is discarded and subsequent white space sequences
-   compacted to single space characters.
-
-   If XmlTags is no then Tag names are folded to upper
-   case and attribute names to lower case.
-
-   Not yet done:
-    - Doctype subset and marked sections
-*/
+/**************************************************************************//**
+ * @file
+ * Lexer for HTML and XML Parsers.
+ *
+ * Given an input source, it returns a sequence of tokens.
+ * + * GetToken(source) gets the next token + * UngetToken(source) provides one level undo + * + * The tags include an attribute list: + * + * - linked list of attribute/value nodes + * - each node has 2 NULL-terminated strings. + * - entities are replaced in attribute values + * + * white space is compacted if not in preformatted mode + * If not in preformatted mode then leading white space + * is discarded and subsequent white space sequences + * compacted to single space characters. + * + * If XmlTags is no then Tag names are folded to upper + * case and attribute names to lower case. + * + * Not yet done: + * - Doctype subset and marked sections + * + * @author HTACG, et al (consult git log) + * + * @copyright + * (c) 1998-2021 (W3C) MIT, ERCIM, Keio University, and HTACG. + * See tidy.h for the copyright notice. + * @par + * All Rights Reserved. + * @par + * See `tidy.h` for the complete license. + * + * @date Additional updates: consult git log + * + ******************************************************************************/ #ifdef __cplusplus extern "C" { @@ -35,8 +48,23 @@ extern "C" { #include "forward.h" -/* lexer character types -*/ +/** @addtogroup internal_api */ +/** @{ */ + + +/***************************************************************************//** + ** @defgroup lexer_h HTML and XML Lexing + ** + ** These functions and structures form the internal API for document + ** lexing. + ** + ** @{ + ******************************************************************************/ + + +/** + * Lexer character types. + */ #define digit 1u #define letter 2u #define namechar 4u @@ -47,8 +75,9 @@ extern "C" { #define digithex 128u -/* node->type is one of these values -*/ +/** + * node->type is one of these values + */ typedef enum { RootNode, @@ -68,9 +97,9 @@ typedef enum } NodeType; - -/* lexer GetToken states -*/ +/** + * Lexer GetToken() states. 
+ */ typedef enum { LEX_CONTENT, @@ -88,7 +117,10 @@ typedef enum LEX_XMLDECL } LexerState; -/* ParseDocTypeDecl state constants */ + +/** + * ParseDocTypeDecl state constants. + */ typedef enum { DT_INTERMEDIATE, @@ -98,67 +130,44 @@ typedef enum DT_INTSUBSET } ParseDocTypeDeclState; -/* content model shortcut encoding - Descriptions are tentative. -*/ +/** + * Content model shortcut encoding. + * Descriptions are tentative. + */ #define CM_UNKNOWN 0 -/* Elements with no content. Map to HTML specification. */ -#define CM_EMPTY (1 << 0) -/* Elements that appear outside of "BODY". */ -#define CM_HTML (1 << 1) -/* Elements that can appear within HEAD. */ -#define CM_HEAD (1 << 2) -/* HTML "block" elements. */ -#define CM_BLOCK (1 << 3) -/* HTML "inline" elements. */ -#define CM_INLINE (1 << 4) -/* Elements that mark list item ("LI"). */ -#define CM_LIST (1 << 5) -/* Elements that mark definition list item ("DL", "DT"). */ -#define CM_DEFLIST (1 << 6) -/* Elements that can appear inside TABLE. */ -#define CM_TABLE (1 << 7) -/* Used for "THEAD", "TFOOT" or "TBODY". */ -#define CM_ROWGRP (1 << 8) -/* Used for "TD", "TH" */ -#define CM_ROW (1 << 9) -/* Elements whose content must be protected against white space movement. - Includes some elements that can found in forms. */ -#define CM_FIELD (1 << 10) -/* Used to avoid propagating inline emphasis inside some elements - such as OBJECT or APPLET. */ -#define CM_OBJECT (1 << 11) -/* Elements that allows "PARAM". */ -#define CM_PARAM (1 << 12) -/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */ -#define CM_FRAMES (1 << 13) -/* Heading elements (h1, h2, ...). */ -#define CM_HEADING (1 << 14) -/* Elements with an optional end tag. */ -#define CM_OPT (1 << 15) -/* Elements that use "align" attribute for vertical position. */ -#define CM_IMG (1 << 16) -/* Elements with inline and block model. Used to avoid calling InlineDup. 
*/ -#define CM_MIXED (1 << 17) -/* Elements whose content needs to be indented only if containing one - CM_BLOCK element. */ -#define CM_NO_INDENT (1 << 18) -/* Elements that are obsolete (such as "dir", "menu"). */ -#define CM_OBSOLETE (1 << 19) -/* User defined elements. Used to determine how attributes without value - should be printed. */ -#define CM_NEW (1 << 20) -/* Elements that cannot be omitted. */ -#define CM_OMITST (1 << 21) +#define CM_EMPTY (1 << 0) /**< Elements with no content. Map to HTML specification. */ +#define CM_HTML (1 << 1) /**< Elements that appear outside of "BODY". */ +#define CM_HEAD (1 << 2) /**< Elements that can appear within HEAD. */ +#define CM_BLOCK (1 << 3) /**< HTML "block" elements. */ +#define CM_INLINE (1 << 4) /**< HTML "inline" elements. */ +#define CM_LIST (1 << 5) /**< Elements that mark list item ("LI"). */ +#define CM_DEFLIST (1 << 6) /**< Elements that mark definition list item ("DL", "DT"). */ +#define CM_TABLE (1 << 7) /**< Elements that can appear inside TABLE. */ +#define CM_ROWGRP (1 << 8) /**< Used for "THEAD", "TFOOT" or "TBODY". */ +#define CM_ROW (1 << 9) /**< Used for "TD", "TH" */ +#define CM_FIELD (1 << 10) /**< Elements whose content must be protected against white space movement. Includes some elements that can found in forms. */ +#define CM_OBJECT (1 << 11) /**< Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET. */ +#define CM_PARAM (1 << 12) /**< Elements that allows "PARAM". */ +#define CM_FRAMES (1 << 13) /**< "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */ +#define CM_HEADING (1 << 14) /**< Heading elements (h1, h2, ...). */ +#define CM_OPT (1 << 15) /**< Elements with an optional end tag. */ +#define CM_IMG (1 << 16) /**< Elements that use "align" attribute for vertical position. */ +#define CM_MIXED (1 << 17) /**< Elements with inline and block model. Used to avoid calling InlineDup. 
*/ +#define CM_NO_INDENT (1 << 18) /**< Elements whose content needs to be indented only if containing one CM_BLOCK element. */ +#define CM_OBSOLETE (1 << 19) /**< Elements that are obsolete (such as "dir", "menu"). */ +#define CM_NEW (1 << 20) /**< User defined elements. Used to determine how attributes without value should be printed. */ +#define CM_OMITST (1 << 21) /**< Elements that cannot be omitted. */ -/* If the document uses just HTML 2.0 tags and attributes described -** it as HTML 2.0 Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. -** If there are proprietary tags and attributes then describe it as -** HTML Proprietary. If it includes the xml-lang or xmlns attributes -** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the -** flavors of Voyager (strict, loose or frameset). -*/ + +/** + * If the document uses just HTML 2.0 tags and attributes described + * it is HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. + * If there are proprietary tags and attributes then describe it as + * HTML Proprietary. If it includes the xml-lang or xmlns attributes + * but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the + * flavors of Voyager (strict, loose or frameset). 
+ */ /* unknown */ #define xxxx 0u @@ -220,8 +229,10 @@ typedef enum /* all proprietary types */ #define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN) -/* Linked list of class names and styles -*/ + +/** + * Linked list of class names and styles + */ struct _Style; typedef struct _Style TagStyle; @@ -234,8 +245,9 @@ struct _Style }; -/* Linked list of style properties -*/ +/** + * Linked list of style properties + */ struct _StyleProp; typedef struct _StyleProp StyleProp; @@ -247,11 +259,9 @@ struct _StyleProp }; - - -/* Attribute/Value linked list node -*/ - +/** + * Attribute/Value linked list node + */ struct _AttVal { AttVal* next; @@ -264,93 +274,89 @@ struct _AttVal }; - -/* - Mosaic handles inlines via a separate stack from other elements - We duplicate this to recover from inline markup errors such as: - - <i>italic text -
<p>more italic text</b> normal text - - which for compatibility with Mosaic is mapped to: - - <i>italic text</i> -
<p><i>more italic text</i> normal text - - Note that any inline end tag pop's the effect of the current - inline start tag, so that </b> pop's <i> in the above example. +/** + * Mosaic handles inlines via a separate stack from other elements + * We duplicate this to recover from inline markup errors such as: + * ~~~ + * <i>italic text + *
<p>more italic text</b> normal text + * ~~~ + * which for compatibility with Mosaic is mapped to: + * ~~~ + * <i>italic text</i> + *
<p><i>more italic text</i> normal text + * ~~~ + * Note that any inline end tag pop's the effect of the current + * inline start tag, so that `</b>` pop's `<i>` in the above example. */ struct _IStack { IStack* next; - const Dict* tag; /* tag's dictionary definition */ - tmbstr element; /* name (NULL for text nodes) */ + const Dict* tag; /**< tag's dictionary definition */ + tmbstr element; /**< name (NULL for text nodes) */ AttVal* attributes; }; -/* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, -** etc. etc. -*/ - +/** + * HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc. + */ struct _Node { - Node* parent; /* tree structure */ + Node* parent; /**< tree structure */ Node* prev; Node* next; Node* content; Node* last; AttVal* attributes; - const Dict* was; /* old tag when it was changed */ - const Dict* tag; /* tag's dictionary definition */ + const Dict* was; /**< old tag when it was changed */ + const Dict* tag; /**< tag's dictionary definition */ - tmbstr element; /* name (NULL for text nodes) */ + tmbstr element; /**< name (NULL for text nodes) */ - uint start; /* start of span onto text array */ - uint end; /* end of span onto text array */ - NodeType type; /* TextNode, StartTag, EndTag etc. */ + uint start; /**< start of span onto text array */ + uint end; /**< end of span onto text array */ + NodeType type; /**< TextNode, StartTag, EndTag etc. */ - uint line; /* current line of document */ - uint column; /* current column of document */ + uint line; /**< current line of document */ + uint column; /**< current column of document */ - Bool closed; /* true if closed by explicit end tag */ - Bool implicit; /* true if inferred */ - Bool linebreak; /* true if followed by a line break */ + Bool closed; /**< true if closed by explicit end tag */ + Bool implicit; /**< true if inferred */ + Bool linebreak; /**< true if followed by a line break */ }; -/* - The following are private to the lexer - Use NewLexer() to create a lexer, and - FreeLexer() to free it. 
-*/ - +/** + * The following are private to the lexer. + * Use `NewLexer()` to create a lexer, and `FreeLexer()` to free it. + */ struct _Lexer { - uint lines; /* lines seen */ - uint columns; /* at start of current token */ - Bool waswhite; /* used to collapse contiguous white space */ - Bool pushed; /* true after token has been pushed back */ - Bool insertspace; /* when space is moved after end tag */ - Bool excludeBlocks; /* Netscape compatibility */ - Bool exiled; /* true if moved out of table */ - Bool isvoyager; /* true if xmlns attribute on html element */ - uint versions; /* bit vector of HTML versions */ - uint doctype; /* version as given by doctype (if any) */ - uint versionEmitted; /* version of doctype emitted */ - Bool bad_doctype; /* e.g. if html or PUBLIC is missing */ - uint txtstart; /* start of current node */ - uint txtend; /* end of current node */ - LexerState state; /* state of lexer's finite state machine */ + uint lines; /**< lines seen */ + uint columns; /**< at start of current token */ + Bool waswhite; /**< used to collapse contiguous white space */ + Bool pushed; /**< true after token has been pushed back */ + Bool insertspace; /**< when space is moved after end tag */ + Bool excludeBlocks; /**< Netscape compatibility */ + Bool exiled; /**< true if moved out of table */ + Bool isvoyager; /**< true if xmlns attribute on html element (i.e., "Voyager" was the W3C codename for XHTML). */ + uint versions; /**< bit vector of HTML versions */ + uint doctype; /**< version as given by doctype (if any) */ + uint versionEmitted; /**< version of doctype emitted */ + Bool bad_doctype; /**< e.g. 
if html or PUBLIC is missing */ + uint txtstart; /**< start of current node */ + uint txtend; /**< end of current node */ + LexerState state; /**< state of lexer's finite state machine */ - Node* token; /* last token returned by GetToken() */ - Node* itoken; /* last duplicate inline returned by GetToken() */ - Node* root; /* remember root node of the document */ - Node* parent; /* remember parent node for CDATA elements */ - - Bool seenEndBody; /* true if a