diff --git a/src/clean.c b/src/clean.c
index 5e2b936..e314ba6 100644
--- a/src/clean.c
+++ b/src/clean.c
@@ -1585,11 +1585,17 @@ void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
  */
 void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
 {
+    Stack *stack = TY_(newStack)(doc, 16);
+    Node *next;
+
     tmbchar indent_buf[ 32 ];
     uint indent;
 
     while (node)
     {
+        /* Remember the sibling now; the loop may descend into the children. */
+        next = node->next;
+
         if ( nodeIsBLOCKQUOTE(node) && node->implicit )
         {
             indent = 1;
@@ -1602,19 +1608,31 @@ void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
                 StripOnlyChild( doc, node );
             }
 
-            if (node->content)
-                TY_(BQ2Div)( doc, node->content );
-
             TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
                              2*indent);
 
             RenameElem( doc, node, TidyTag_DIV );
             TY_(AddStyleProperty)(doc, node, indent_buf );
+
+            /* Visit the children next and resume with the sibling afterwards. */
+            if (node->content)
+            {
+                TY_(push)(stack, next);
+                node = node->content;
+                continue;
+            }
         }
         else if (node->content)
-            TY_(BQ2Div)( doc, node->content );
+        {
+            TY_(push)(stack, next);
+            node = node->content;
+            continue;
+        }
 
-        node = node->next;
+        node = next ? next : TY_(pop)(stack);
     }
+
+    /* The stack is heap-allocated by newStack; release it before returning. */
+    TY_(freeStack)(stack);
 }
 
@@ -2736,30 +2754,46 @@ void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
  */
 static void StyleToHead(TidyDocImpl* doc, Node *head, Node *node, Bool fix, int indent)
 {
-	Node *next;
-	while (node)
-	{
-		next = node->next; /* get 'next' now , in case the node is moved */
-		/* dbg_show_node(doc, node, 0, indent); */
-		if (nodeIsSTYLE(node))
-		{
-			if (fix)
-			{
-				TY_(RemoveNode)(node); /* unhook style node from body */
-				TY_(InsertNodeAtEnd)(head, node); /* add to end of head */
-				TY_(Report)(doc, node, head, MOVED_STYLE_TO_HEAD); /* report move */
-			}
-			else
-			{
-				TY_(Report)(doc, node, head, FOUND_STYLE_IN_BODY);
-			}
-		}
-		else if (node->content)
-		{
-			StyleToHead(doc, head, node->content, fix, indent + 1);
-		}
-		node = next; /* process the 'next', if any */
-	}
+    Stack *stack = TY_(newStack)(doc, 16);
+    Node *next;
+
+    while (node)
+    {
+        /* Get 'next' now, in case the node is moved. */
+        next = node->next;
+
+        if (nodeIsSTYLE(node))
+        {
+            if (fix)
+            {
+                TY_(RemoveNode)(node); /* unhook style node from body */
+                TY_(InsertNodeAtEnd)(head, node); /* add to end of head */
+                TY_(Report)(doc, node, head, MOVED_STYLE_TO_HEAD); /* report move */
+            }
+            else
+            {
+                TY_(Report)(doc, node, head, FOUND_STYLE_IN_BODY);
+            }
+        }
+        else if (node->content)
+        {
+            TY_(push)(stack, next);
+            node = node->content;
+            indent++;
+            continue;
+        }
+
+        if (next)
+            node = next;
+        else
+        {
+            node = TY_(pop)(stack);
+            indent--;
+        }
+    }
+
+    /* The stack is heap-allocated by newStack; release it before returning. */
+    TY_(freeStack)(stack);
 }
 
 
diff --git a/src/lexer.c b/src/lexer.c
index 0fe5dd6..fa8d6fb 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -877,15 +877,6 @@ static tmbchar LastChar( tmbstr str )
     return 0;
 }
 
-/*
-   node->type is one of these:
-
-    #define TextNode    1
-    #define StartTag    2
-    #define EndTag      3
-    #define StartEndTag 4
-*/
-
 Lexer* TY_(NewLexer)( TidyDocImpl* doc )
 {
     Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) );
@@ -1545,13 +1536,7 @@ void TY_(FreeNode)( TidyDocImpl* doc, Node *node )
         }
     }
 #endif
-    /* this is no good ;=((
-    if (node && doc && doc->lexer) {
-        if (node == 
doc->lexer->token) {
-            doc->lexer->token = NULL; // TY_(NewNode)( doc->lexer->allocator, doc->lexer );
-        }
-    }
-      ----------------- */
+
     while ( node )
     {
         Node* next = node->next;
@@ -4462,11 +4447,115 @@ static Node *ParseDocTypeDecl(TidyDocImpl* doc)
     return NULL;
 }
 
-/*
- * local variables:
- * mode: c
- * indent-tabs-mode: nil
- * c-basic-offset: 4
- * eval: (c-set-offset 'substatement-open 0)
- * end:
+
+/****************************************************************************//*
+ ** MARK: - Node Stack
+ ***************************************************************************/
+
+
+/**
+ * Create a new stack with a given starting capacity. If memory allocation
+ * fails, then the allocator will panic the program automatically.
+ *
+ * @param doc      The document whose allocator backs the stack.
+ * @param capacity Initial number of node pointers the stack can hold.
+ * @returns A new, empty stack. Release it with TY_(freeStack)().
  */
+Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity)
+{
+    Stack *stack = (Stack *)TidyAlloc(doc->allocator, sizeof(Stack));
+    stack->top = -1;
+    stack->capacity = capacity;
+    stack->firstNode = (Node **)TidyAlloc(doc->allocator, stack->capacity * sizeof(Node*));
+    stack->allocator = doc->allocator;
+    return stack;
+}
+
+
+/**
+ * Increase the stack size. This will be called automatically when the
+ * current stack is full. If memory allocation fails, then the allocator
+ * will panic the program automatically.
+ */
+void TY_(growStack)(Stack *stack)
+{
+    /* Doubling zero would never grow, so a zero capacity becomes one slot. */
+    uint new_capacity = stack->capacity ? stack->capacity * 2 : 1;
+
+    /* The allocator takes a size in bytes, not a count of elements. */
+    Node **firstNode = (Node **)TidyAlloc(stack->allocator, new_capacity * sizeof(Node*));
+
+    memcpy( firstNode, stack->firstNode, sizeof(Node*) * (stack->top + 1) );
+    TidyFree(stack->allocator, stack->firstNode);
+
+    stack->firstNode = firstNode;
+    stack->capacity = new_capacity;
+}
+
+
+/**
+ * Stack is full when top is equal to the last index.
+ */
+Bool TY_(stackFull)(Stack *stack)
+{
+    /* top is a signed int and capacity is unsigned; top + 1 is never negative. */
+    return (uint)(stack->top + 1) >= stack->capacity;
+}
+
+
+/**
+ * Stack is empty when top is equal to -1
+ */
+Bool TY_(stackEmpty)(Stack *stack)
+{
+    return stack->top == -1;
+}
+
+
+/**
+ * Push an item to the stack. A NULL node is ignored and leaves the stack
+ * unchanged.
+ */
+void TY_(push)(Stack *stack, Node *node)
+{
+    if (TY_(stackFull)(stack))
+        TY_(growStack)(stack);
+
+    if (node)
+        stack->firstNode[++stack->top] = node;
+}
+
+
+/**
+ * Pop an item from the stack. Returns NULL when the stack is empty.
+ */
+Node* TY_(pop)(Stack *stack)
+{
+    return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top--];
+}
+
+
+/**
+ * Peek at the stack. Returns the top item without removing it, or NULL
+ * when the stack is empty.
+ */
+FUNC_UNUSED Node* TY_(peek)(Stack *stack)
+{
+    return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top];
+}
+
+/**
+ * Frees the stack when done. Both the node array and the Stack structure
+ * itself are released, so the pointer must not be used afterwards.
+ */
+void TY_(freeStack)(Stack *stack)
+{
+    TidyAllocator *allocator;
+
+    if (!stack)
+        return;
+
+    allocator = stack->allocator;
+    TidyFree( allocator, stack->firstNode );
+    TidyFree( allocator, stack );
+}
diff --git a/src/lexer.h b/src/lexer.h
index 113a9f4..9d49898 100644
--- a/src/lexer.h
+++ b/src/lexer.h
@@ -1,33 +1,46 @@
 #ifndef __LEXER_H__
 #define __LEXER_H__
 
-/* lexer.h -- Lexer for html parser
-
-  (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
-  See tidy.h for the copyright notice.
 
-  Given an input source, it returns a sequence of tokens.
-
-     GetToken(source) gets the next token
-     UngetToken(source) provides one level undo
-
-  The tags include an attribute list:
-
-    - linked list of attribute/value nodes
-    - each node has 2 NULL-terminated strings.
-    - entities are replaced in attribute values
-
-  white space is compacted if not in preformatted mode
-  If not in preformatted mode then leading white space
-  is discarded and subsequent white space sequences
-  compacted to single space characters.
-
-  If XmlTags is no then Tag names are folded to upper
-  case and attribute names to lower case.
-
- Not yet done:
-   -   Doctype subset and marked sections
-*/
+/**************************************************************************//**
+ * @file
+ * Lexer for HTML and XML Parsers.
+ *
+ * Given an input source, it returns a sequence of tokens.
+ * + * GetToken(source) gets the next token + * UngetToken(source) provides one level undo + * + * The tags include an attribute list: + * + * - linked list of attribute/value nodes + * - each node has 2 NULL-terminated strings. + * - entities are replaced in attribute values + * + * white space is compacted if not in preformatted mode + * If not in preformatted mode then leading white space + * is discarded and subsequent white space sequences + * compacted to single space characters. + * + * If XmlTags is no then Tag names are folded to upper + * case and attribute names to lower case. + * + * Not yet done: + * - Doctype subset and marked sections + * + * @author HTACG, et al (consult git log) + * + * @copyright + * (c) 1998-2021 (W3C) MIT, ERCIM, Keio University, and HTACG. + * See tidy.h for the copyright notice. + * @par + * All Rights Reserved. + * @par + * See `tidy.h` for the complete license. + * + * @date Additional updates: consult git log + * + ******************************************************************************/ #ifdef __cplusplus extern "C" { @@ -35,8 +48,23 @@ extern "C" { #include "forward.h" -/* lexer character types -*/ +/** @addtogroup internal_api */ +/** @{ */ + + +/***************************************************************************//** + ** @defgroup lexer_h HTML and XML Lexing + ** + ** These functions and structures form the internal API for document + ** lexing. + ** + ** @{ + ******************************************************************************/ + + +/** + * Lexer character types. + */ #define digit 1u #define letter 2u #define namechar 4u @@ -47,8 +75,9 @@ extern "C" { #define digithex 128u -/* node->type is one of these values -*/ +/** + * node->type is one of these values + */ typedef enum { RootNode, @@ -68,9 +97,9 @@ typedef enum } NodeType; - -/* lexer GetToken states -*/ +/** + * Lexer GetToken() states. 
+ */ typedef enum { LEX_CONTENT, @@ -88,7 +117,10 @@ typedef enum LEX_XMLDECL } LexerState; -/* ParseDocTypeDecl state constants */ + +/** + * ParseDocTypeDecl state constants. + */ typedef enum { DT_INTERMEDIATE, @@ -98,67 +130,44 @@ typedef enum DT_INTSUBSET } ParseDocTypeDeclState; -/* content model shortcut encoding - Descriptions are tentative. -*/ +/** + * Content model shortcut encoding. + * Descriptions are tentative. + */ #define CM_UNKNOWN 0 -/* Elements with no content. Map to HTML specification. */ -#define CM_EMPTY (1 << 0) -/* Elements that appear outside of "BODY". */ -#define CM_HTML (1 << 1) -/* Elements that can appear within HEAD. */ -#define CM_HEAD (1 << 2) -/* HTML "block" elements. */ -#define CM_BLOCK (1 << 3) -/* HTML "inline" elements. */ -#define CM_INLINE (1 << 4) -/* Elements that mark list item ("LI"). */ -#define CM_LIST (1 << 5) -/* Elements that mark definition list item ("DL", "DT"). */ -#define CM_DEFLIST (1 << 6) -/* Elements that can appear inside TABLE. */ -#define CM_TABLE (1 << 7) -/* Used for "THEAD", "TFOOT" or "TBODY". */ -#define CM_ROWGRP (1 << 8) -/* Used for "TD", "TH" */ -#define CM_ROW (1 << 9) -/* Elements whose content must be protected against white space movement. - Includes some elements that can found in forms. */ -#define CM_FIELD (1 << 10) -/* Used to avoid propagating inline emphasis inside some elements - such as OBJECT or APPLET. */ -#define CM_OBJECT (1 << 11) -/* Elements that allows "PARAM". */ -#define CM_PARAM (1 << 12) -/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */ -#define CM_FRAMES (1 << 13) -/* Heading elements (h1, h2, ...). */ -#define CM_HEADING (1 << 14) -/* Elements with an optional end tag. */ -#define CM_OPT (1 << 15) -/* Elements that use "align" attribute for vertical position. */ -#define CM_IMG (1 << 16) -/* Elements with inline and block model. Used to avoid calling InlineDup. 
*/ -#define CM_MIXED (1 << 17) -/* Elements whose content needs to be indented only if containing one - CM_BLOCK element. */ -#define CM_NO_INDENT (1 << 18) -/* Elements that are obsolete (such as "dir", "menu"). */ -#define CM_OBSOLETE (1 << 19) -/* User defined elements. Used to determine how attributes without value - should be printed. */ -#define CM_NEW (1 << 20) -/* Elements that cannot be omitted. */ -#define CM_OMITST (1 << 21) +#define CM_EMPTY (1 << 0) /**< Elements with no content. Map to HTML specification. */ +#define CM_HTML (1 << 1) /**< Elements that appear outside of "BODY". */ +#define CM_HEAD (1 << 2) /**< Elements that can appear within HEAD. */ +#define CM_BLOCK (1 << 3) /**< HTML "block" elements. */ +#define CM_INLINE (1 << 4) /**< HTML "inline" elements. */ +#define CM_LIST (1 << 5) /**< Elements that mark list item ("LI"). */ +#define CM_DEFLIST (1 << 6) /**< Elements that mark definition list item ("DL", "DT"). */ +#define CM_TABLE (1 << 7) /**< Elements that can appear inside TABLE. */ +#define CM_ROWGRP (1 << 8) /**< Used for "THEAD", "TFOOT" or "TBODY". */ +#define CM_ROW (1 << 9) /**< Used for "TD", "TH" */ +#define CM_FIELD (1 << 10) /**< Elements whose content must be protected against white space movement. Includes some elements that can found in forms. */ +#define CM_OBJECT (1 << 11) /**< Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET. */ +#define CM_PARAM (1 << 12) /**< Elements that allows "PARAM". */ +#define CM_FRAMES (1 << 13) /**< "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */ +#define CM_HEADING (1 << 14) /**< Heading elements (h1, h2, ...). */ +#define CM_OPT (1 << 15) /**< Elements with an optional end tag. */ +#define CM_IMG (1 << 16) /**< Elements that use "align" attribute for vertical position. */ +#define CM_MIXED (1 << 17) /**< Elements with inline and block model. Used to avoid calling InlineDup. 
*/ +#define CM_NO_INDENT (1 << 18) /**< Elements whose content needs to be indented only if containing one CM_BLOCK element. */ +#define CM_OBSOLETE (1 << 19) /**< Elements that are obsolete (such as "dir", "menu"). */ +#define CM_NEW (1 << 20) /**< User defined elements. Used to determine how attributes without value should be printed. */ +#define CM_OMITST (1 << 21) /**< Elements that cannot be omitted. */ -/* If the document uses just HTML 2.0 tags and attributes described -** it as HTML 2.0 Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. -** If there are proprietary tags and attributes then describe it as -** HTML Proprietary. If it includes the xml-lang or xmlns attributes -** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the -** flavors of Voyager (strict, loose or frameset). -*/ + +/** + * If the document uses just HTML 2.0 tags and attributes described + * it is HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. + * If there are proprietary tags and attributes then describe it as + * HTML Proprietary. If it includes the xml-lang or xmlns attributes + * but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the + * flavors of Voyager (strict, loose or frameset). 
+ */ /* unknown */ #define xxxx 0u @@ -220,8 +229,10 @@ typedef enum /* all proprietary types */ #define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN) -/* Linked list of class names and styles -*/ + +/** + * Linked list of class names and styles + */ struct _Style; typedef struct _Style TagStyle; @@ -234,8 +245,9 @@ struct _Style }; -/* Linked list of style properties -*/ +/** + * Linked list of style properties + */ struct _StyleProp; typedef struct _StyleProp StyleProp; @@ -247,11 +259,9 @@ struct _StyleProp }; - - -/* Attribute/Value linked list node -*/ - +/** + * Attribute/Value linked list node + */ struct _AttVal { AttVal* next; @@ -264,93 +274,89 @@ struct _AttVal }; - -/* - Mosaic handles inlines via a separate stack from other elements - We duplicate this to recover from inline markup errors such as: - - italic text -

more italic text normal text - - which for compatibility with Mosaic is mapped to: - - italic text -

more italic text normal text - - Note that any inline end tag pop's the effect of the current - inline start tag, so that pop's in the above example. +/** + * Mosaic handles inlines via a separate stack from other elements + * We duplicate this to recover from inline markup errors such as: + * ~~~ + * italic text + *

more italic text normal text + * ~~~ + * which for compatibility with Mosaic is mapped to: + * ~~~ + * italic text + *

more italic text normal text + * ~~~ + * Note that any inline end tag pop's the effect of the current + * inline start tag, so that `` pop's `` in the above example. */ struct _IStack { IStack* next; - const Dict* tag; /* tag's dictionary definition */ - tmbstr element; /* name (NULL for text nodes) */ + const Dict* tag; /**< tag's dictionary definition */ + tmbstr element; /**< name (NULL for text nodes) */ AttVal* attributes; }; -/* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, -** etc. etc. -*/ - +/** + * HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc. + */ struct _Node { - Node* parent; /* tree structure */ + Node* parent; /**< tree structure */ Node* prev; Node* next; Node* content; Node* last; AttVal* attributes; - const Dict* was; /* old tag when it was changed */ - const Dict* tag; /* tag's dictionary definition */ + const Dict* was; /**< old tag when it was changed */ + const Dict* tag; /**< tag's dictionary definition */ - tmbstr element; /* name (NULL for text nodes) */ + tmbstr element; /**< name (NULL for text nodes) */ - uint start; /* start of span onto text array */ - uint end; /* end of span onto text array */ - NodeType type; /* TextNode, StartTag, EndTag etc. */ + uint start; /**< start of span onto text array */ + uint end; /**< end of span onto text array */ + NodeType type; /**< TextNode, StartTag, EndTag etc. */ - uint line; /* current line of document */ - uint column; /* current column of document */ + uint line; /**< current line of document */ + uint column; /**< current column of document */ - Bool closed; /* true if closed by explicit end tag */ - Bool implicit; /* true if inferred */ - Bool linebreak; /* true if followed by a line break */ + Bool closed; /**< true if closed by explicit end tag */ + Bool implicit; /**< true if inferred */ + Bool linebreak; /**< true if followed by a line break */ }; -/* - The following are private to the lexer - Use NewLexer() to create a lexer, and - FreeLexer() to free it. 
-*/ - +/** + * The following are private to the lexer. + * Use `NewLexer()` to create a lexer, and `FreeLexer()` to free it. + */ struct _Lexer { - uint lines; /* lines seen */ - uint columns; /* at start of current token */ - Bool waswhite; /* used to collapse contiguous white space */ - Bool pushed; /* true after token has been pushed back */ - Bool insertspace; /* when space is moved after end tag */ - Bool excludeBlocks; /* Netscape compatibility */ - Bool exiled; /* true if moved out of table */ - Bool isvoyager; /* true if xmlns attribute on html element */ - uint versions; /* bit vector of HTML versions */ - uint doctype; /* version as given by doctype (if any) */ - uint versionEmitted; /* version of doctype emitted */ - Bool bad_doctype; /* e.g. if html or PUBLIC is missing */ - uint txtstart; /* start of current node */ - uint txtend; /* end of current node */ - LexerState state; /* state of lexer's finite state machine */ + uint lines; /**< lines seen */ + uint columns; /**< at start of current token */ + Bool waswhite; /**< used to collapse contiguous white space */ + Bool pushed; /**< true after token has been pushed back */ + Bool insertspace; /**< when space is moved after end tag */ + Bool excludeBlocks; /**< Netscape compatibility */ + Bool exiled; /**< true if moved out of table */ + Bool isvoyager; /**< true if xmlns attribute on html element (i.e., "Voyager" was the W3C codename for XHTML). */ + uint versions; /**< bit vector of HTML versions */ + uint doctype; /**< version as given by doctype (if any) */ + uint versionEmitted; /**< version of doctype emitted */ + Bool bad_doctype; /**< e.g. 
if html or PUBLIC is missing */ + uint txtstart; /**< start of current node */ + uint txtend; /**< end of current node */ + LexerState state; /**< state of lexer's finite state machine */ - Node* token; /* last token returned by GetToken() */ - Node* itoken; /* last duplicate inline returned by GetToken() */ - Node* root; /* remember root node of the document */ - Node* parent; /* remember parent node for CDATA elements */ - - Bool seenEndBody; /* true if a tag has been encountered */ - Bool seenEndHtml; /* true if a tag has been encountered */ + Node* token; /**< last token returned by GetToken() */ + Node* itoken; /**< last duplicate inline returned by GetToken() */ + Node* root; /**< remember root node of the document */ + Node* parent; /**< remember parent node for CDATA elements */ + + Bool seenEndBody; /**< true if a `` tag has been encountered */ + Bool seenEndHtml; /**< true if a `` tag has been encountered */ /* Lexer character buffer @@ -361,33 +367,57 @@ struct _Lexer lexsize must be reset for each file. 
*/ - tmbstr lexbuf; /* MB character buffer */ - uint lexlength; /* allocated */ - uint lexsize; /* used */ + tmbstr lexbuf; /**< MB character buffer */ + uint lexlength; /**< allocated */ + uint lexsize; /**< used */ /* Inline stack for compatibility with Mosaic */ - Node* inode; /* for deferring text node */ - IStack* insert; /* for inferring inline tags */ + Node* inode; /**< for deferring text node */ + IStack* insert; /**< for inferring inline tags */ IStack* istack; - uint istacklength; /* allocated */ - uint istacksize; /* used */ - uint istackbase; /* start of frame */ + uint istacklength; /**< allocated */ + uint istacksize; /**< used */ + uint istackbase; /**< start of frame */ - TagStyle *styles; /* used for cleaning up presentation markup */ + TagStyle *styles; /**< used for cleaning up presentation markup */ - TidyAllocator* allocator; /* allocator */ + TidyAllocator* allocator; /**< allocator */ }; -/* Lexer Functions -*/ +/** + * modes for GetToken() + * + * MixedContent -- for elements which don't accept PCDATA + * Preformatted -- white space preserved as is + * IgnoreMarkup -- for CDATA elements such as script, style + */ +typedef enum +{ + IgnoreWhitespace, + MixedContent, + Preformatted, + IgnoreMarkup, + OtherNamespace, + CdataContent +} GetTokenMode; -/* choose what version to use for new doctype */ + +/** @name Lexer Functions + * @{ + */ + + +/** + * Choose what version to use for new doctype + */ TY_PRIVATE int TY_(HTMLVersion)( TidyDocImpl* doc ); -/* everything is allowed in proprietary version of HTML */ -/* this is handled here rather than in the tag/attr dicts */ +/** + * Everything is allowed in proprietary version of HTML. 
+ * This is handled here rather than in the tag/attr dicts + */ TY_PRIVATE void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers ); TY_PRIVATE Bool TY_(IsWhite)(uint c); @@ -399,7 +429,6 @@ TY_PRIVATE Bool TY_(IsNamechar)(uint c); TY_PRIVATE Bool TY_(IsXMLLetter)(uint c); TY_PRIVATE Bool TY_(IsXMLNamechar)(uint c); -/* Bool IsLower(uint c); */ TY_PRIVATE Bool TY_(IsUpper)(uint c); TY_PRIVATE uint TY_(ToLower)(uint c); TY_PRIVATE uint TY_(ToUpper)(uint c); @@ -407,60 +436,82 @@ TY_PRIVATE uint TY_(ToUpper)(uint c); TY_PRIVATE Lexer* TY_(NewLexer)( TidyDocImpl* doc ); TY_PRIVATE void TY_(FreeLexer)( TidyDocImpl* doc ); -/* store character c as UTF-8 encoded byte stream */ + +/** + * Store character c as UTF-8 encoded byte stream + */ TY_PRIVATE void TY_(AddCharToLexer)( Lexer *lexer, uint c ); -/* - Used for elements and text nodes - element name is NULL for text nodes - start and end are offsets into lexbuf - which contains the textual content of - all elements in the parse tree. - parent and content allow traversal - of the parse tree in any direction. - attributes are represented as a linked - list of AttVal nodes which hold the - strings for attribute/value pairs. +/** + * Used for elements and text nodes. + * - Element name is NULL for text nodes. + * - start and end are offsets into lexbuf, + * which contains the textual content of + * all elements in the parse tree. + * - parent and content allow traversal + * of the parse tree in any direction. + * - attributes are represented as a linked + * list of AttVal nodes which hold the + * strings for attribute/value pairs. */ TY_PRIVATE Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer ); -/* used to clone heading nodes when split by an


*/ +/** + * Used to clone heading nodes when split by an `
` + */ TY_PRIVATE Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element ); -/* free node's attributes */ + +/** + * Free node's attributes + */ TY_PRIVATE void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node ); -/* doesn't repair attribute list linkage */ + +/** + * Doesn't repair attribute list linkage + */ TY_PRIVATE void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av ); -/* detach attribute from node */ + +/** + * Detach attribute from node + */ TY_PRIVATE void TY_(DetachAttribute)( Node *node, AttVal *attr ); -/* detach attribute from node then free it -*/ + +/** + * Detach attribute from node then free it. + */ TY_PRIVATE void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr ); -/* - Free document nodes by iterating through peers and recursing - through children. Set next to NULL before calling FreeNode() - to avoid freeing peer nodes. Doesn't patch up prev/next links. + +/** + * Free document nodes by iterating through peers and recursing + * through children. Set `next` to `NULL` before calling `FreeNode()` + * to avoid freeing peer nodes. Doesn't patch up prev/next links. */ TY_PRIVATE void TY_(FreeNode)( TidyDocImpl* doc, Node *node ); + TY_PRIVATE Node* TY_(TextToken)( Lexer *lexer ); -/* used for creating preformatted text from Word2000 */ + +/** + * Used for creating preformatted text from Word2000. + */ TY_PRIVATE Node* TY_(NewLineNode)( Lexer *lexer ); -/* used for adding a   for Word2000 */ + +/** + * Used for adding a   for Word2000. 
+ */ TY_PRIVATE Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt ); -TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str ); -/* TY_PRIVATE void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); */ -/* find element */ +TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str ); TY_PRIVATE Node* TY_(FindDocType)( TidyDocImpl* doc ); TY_PRIVATE Node* TY_(FindHTML)( TidyDocImpl* doc ); TY_PRIVATE Node* TY_(FindHEAD)( TidyDocImpl* doc ); @@ -468,10 +519,16 @@ TY_PRIVATE Node* TY_(FindTITLE)(TidyDocImpl* doc); TY_PRIVATE Node* TY_(FindBody)( TidyDocImpl* doc ); TY_PRIVATE Node* TY_(FindXmlDecl)(TidyDocImpl* doc); -/* Returns containing block element, if any */ + +/** + * Returns containing block element, if any + */ TY_PRIVATE Node* TY_(FindContainer)( Node* node ); -/* add meta element for Tidy */ + +/** + * Add meta element for Tidy. + */ TY_PRIVATE Bool TY_(AddGenerator)( TidyDocImpl* doc ); TY_PRIVATE uint TY_(ApparentVersion)( TidyDocImpl* doc ); @@ -485,118 +542,209 @@ TY_PRIVATE Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc ); TY_PRIVATE Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc ); -/* fixup doctype if missing */ +/** + * Fixup doctype if missing. + */ TY_PRIVATE Bool TY_(FixDocType)( TidyDocImpl* doc ); -/* ensure XML document starts with */ -/* add encoding attribute if not using ASCII or UTF-8 output */ + +/** + * Ensure XML document starts with ,and + * add encoding attribute if not using ASCII or UTF-8 output. 
+ */ TY_PRIVATE Bool TY_(FixXmlDecl)( TidyDocImpl* doc ); + TY_PRIVATE Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id); TY_PRIVATE void TY_(UngetToken)( TidyDocImpl* doc ); - -/* - modes for GetToken() - - MixedContent -- for elements which don't accept PCDATA - Preformatted -- white space preserved as is - IgnoreMarkup -- for CDATA elements such as script, style -*/ -typedef enum -{ - IgnoreWhitespace, - MixedContent, - Preformatted, - IgnoreMarkup, - OtherNamespace, - CdataContent -} GetTokenMode; - TY_PRIVATE Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode ); TY_PRIVATE void TY_(InitMap)(void); -/* create a new attribute */ +/** + * Create a new attribute. + */ TY_PRIVATE AttVal* TY_(NewAttribute)( TidyDocImpl* doc ); -/* create a new attribute with given name and value */ + +/** + * Create a new attribute with given name and value. + */ TY_PRIVATE AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value, int delim ); -/* insert attribute at the end of attribute list of a node */ + +/** + * Insert attribute at the end of attribute list of a node. + */ TY_PRIVATE void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av ); -/* insert attribute at the start of attribute list of a node */ +/** + * Insert attribute at the start of attribute list of a node. + */ TY_PRIVATE void TY_(InsertAttributeAtStart)( Node *node, AttVal *av ); -/************************************* - In-line Stack functions -*************************************/ + +/** @} + * @name Inline Stack Functions + * @{ + */ -/* duplicate attributes */ +/** + * Duplicate attributes. + */ TY_PRIVATE AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs ); -/* - push a copy of an inline node onto stack - but don't push if implicit or OBJECT or APPLET - (implicit tags are ones generated from the istack) - One issue arises with pushing inlines when - the tag is already pushed. For instance: - -

text -

more text - - Shouldn't be mapped to - -

text

-

more text -*/ +/** + * Push a copy of an inline node onto stack, but don't push if + * implicit or OBJECT or APPLET (implicit tags are ones generated + * from the istack). + * + * One issue arises with pushing inlines when the tag is already pushed. + * For instance: + * ~~~ + *

text + *

more text + * ~~~ + * Shouldn't be mapped to + * ~~~ + *

text

+ *

more text + * ~~~ + */ TY_PRIVATE void TY_(PushInline)( TidyDocImpl* doc, Node* node ); -/* pop inline stack */ + +/** + * Pop inline stack. + */ TY_PRIVATE void TY_(PopInline)( TidyDocImpl* doc, Node* node ); + TY_PRIVATE Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node ); TY_PRIVATE Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node ); -/* - This has the effect of inserting "missing" inline - elements around the contents of blocklevel elements - such as P, TD, TH, DIV, PRE etc. This procedure is - called at the start of ParseBlock. when the inline - stack is not empty, as will be the case in: -

italic heading

- - which is then treated as equivalent to - -

italic heading

- - This is implemented by setting the lexer into a mode - where it gets tokens from the inline stack rather than - from the input stream. -*/ +/** + * This has the effect of inserting "missing" inline elements around the + * contents of blocklevel elements such as P, TD, TH, DIV, PRE etc. This + * procedure is called at the start of `ParseBlock`, when the inline + * stack is not empty, as will be the case in: + * ~~~ + *

italic heading

+ * ~~~ + * which is then treated as equivalent to + * ~~~ + *

italic heading

+ * ~~~ + * This is implemented by setting the lexer into a mode where it gets + * tokens from the inline stack rather than from the input stream. + */ TY_PRIVATE int TY_(InlineDup)( TidyDocImpl* doc, Node *node ); -/* - defer duplicates when entering a table or other - element where the inlines shouldn't be duplicated -*/ + +/** + * Defer duplicates when entering a table or other + * element where the inlines shouldn't be duplicated. + */ TY_PRIVATE void TY_(DeferDup)( TidyDocImpl* doc ); + + TY_PRIVATE Node* TY_(InsertedToken)( TidyDocImpl* doc ); -/* stack manipulation for inline elements */ +/** + * Stack manipulation for inline elements + */ TY_PRIVATE Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node ); + + TY_PRIVATE Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element ); + +/** @} + * @name Generic stack of nodes. + * @{ + */ + + +/** + * This typedef represents a stack of addresses to nodes. Tidy uses these to + * try to limit recursion by pushing nodes to a stack when possible instead + * of recursing. + */ +typedef struct _Stack { + int top; /**< Current top position. */ + unsigned capacity; /**< Current capacity. Can be expanded. */ + Node **firstNode; /**< A pointer to the first pointer to a Node in an array of node addresses. */ + TidyAllocator* allocator; /**< Tidy's allocator, used at instantiation and expanding. */ +} Stack; + + +/** + * Create a new stack with a given starting capacity. If memory allocation + * fails, then the allocator will panic the program automatically. + */ +TY_PRIVATE Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity); + + +/** + * Increase the stack size. This will be called automatically when the + * current stack is full. If memory allocation fails, then the allocator + * will panic the program automatically. + */ +TY_PRIVATE void TY_(growStack)(Stack *stack); + + +/** + * Stack is full when top is equal to the last index. 
+ */ +TY_PRIVATE Bool TY_(stackFull)(Stack *stack); + + +/** + * Stack is empty when top is equal to -1 + */ +TY_PRIVATE Bool TY_(stackEmpty)(Stack *stack); + + +/** + * Push an item to the stack. + */ +TY_PRIVATE void TY_(push)(Stack *stack, Node *node); + + +/** + * Pop an item from the stack. + */ +TY_PRIVATE Node* TY_(pop)(Stack *stack); + + +/** + * Peek at the stack. + */ +TY_PRIVATE Node* TY_(peek)(Stack *stack); + +/** + * Frees the stack when done. + */ +TY_PRIVATE void TY_(freeStack)(Stack *stack); + + +/** @} + */ + + #ifdef __cplusplus } #endif +/** @} end lexer_h group */ +/** @} end internal_api group */ + #endif /* __LEXER_H__ */ diff --git a/src/parser.c b/src/parser.c index eab7393..2a4fcb9 100644 --- a/src/parser.c +++ b/src/parser.c @@ -14,161 +14,37 @@ #include "tmbstr.h" #include "sprtf.h" -/* - Issue #72 - Need to know to avoid error-reporting - no warning only if --show-body-only yes - Issue #132 - likewise avoid warning if showing body only + +/****************************************************************************//* + ** MARK: - Forward Declarations + ***************************************************************************/ + + +static void ParseTag( TidyDocImpl* doc, Node *node, GetTokenMode mode ); + + +/****************************************************************************//* + ** MARK: - Configuration Options + ***************************************************************************/ + + +/** + * Issue #72 - Need to know to avoid error-reporting - no warning only if + * --show-body-only yes. + * Issue #132 - Likewise avoid warning if showing body only. */ #define showingBodyOnly(doc) (cfgAutoBool(doc,TidyBodyOnly) == TidyYesState) ? 
yes : no -Bool TY_(CheckNodeIntegrity)(Node *node) -{ -#ifndef NO_NODE_INTEGRITY_CHECK - Node *child; +/****************************************************************************//* + ** MARK: - Node Operations + ***************************************************************************/ - if (node->prev) - { - if (node->prev->next != node) - return no; - } - if (node->next) - { - if (node->next == node || node->next->prev != node) - return no; - } - - if (node->parent) - { - if (node->prev == NULL && node->parent->content != node) - return no; - - if (node->next == NULL && node->parent->last != node) - return no; - } - - for (child = node->content; child; child = child->next) - if ( child->parent != node || !TY_(CheckNodeIntegrity)(child) ) - return no; - -#endif - return yes; -} - -/* - used to determine how attributes - without values should be printed - this was introduced to deal with - user defined tags e.g. ColdFusion -*/ -Bool TY_(IsNewNode)(Node *node) -{ - if (node && node->tag) - { - return (node->tag->model & CM_NEW); - } - return yes; -} - -void TY_(CoerceNode)(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected) -{ - const Dict* tag = TY_(LookupTagDef)(tid); - Node* tmp = TY_(InferredTag)(doc, tag->id); - - if (obsolete) - TY_(Report)(doc, node, tmp, OBSOLETE_ELEMENT); - else if (unexpected) - TY_(Report)(doc, node, tmp, REPLACING_UNEX_ELEMENT); - else - TY_(Report)(doc, node, tmp, REPLACING_ELEMENT); - - TidyDocFree(doc, tmp->element); - TidyDocFree(doc, tmp); - - node->was = node->tag; - node->tag = tag; - node->type = StartTag; - node->implicit = yes; - TidyDocFree(doc, node->element); - node->element = TY_(tmbstrdup)(doc->allocator, tag->name); -} - -/* extract a node and its children from a markup tree */ -Node *TY_(RemoveNode)(Node *node) -{ - if (node->prev) - node->prev->next = node->next; - - if (node->next) - node->next->prev = node->prev; - - if (node->parent) - { - if (node->parent->content == node) - 
node->parent->content = node->next; - - if (node->parent->last == node) - node->parent->last = node->prev; - } - - node->parent = node->prev = node->next = NULL; - return node; -} - -/* remove node from markup tree and discard it */ -Node *TY_(DiscardElement)( TidyDocImpl* doc, Node *element ) -{ - Node *next = NULL; - - if (element) - { - next = element->next; - TY_(RemoveNode)(element); - TY_(FreeNode)( doc, element); - } - - return next; -} - -/* - insert "node" into markup tree as the first element - of content of "element" -*/ -void TY_(InsertNodeAtStart)(Node *element, Node *node) -{ - node->parent = element; - - if (element->content == NULL) - element->last = node; - else - element->content->prev = node; - - node->next = element->content; - node->prev = NULL; - element->content = node; -} - -/* - insert "node" into markup tree as the last element - of content of "element" -*/ -void TY_(InsertNodeAtEnd)(Node *element, Node *node) -{ - node->parent = element; - node->prev = element->last; - - if (element->last != NULL) - element->last->next = node; - else - element->content = node; - - element->last = node; -} - -/* - insert "node" into markup tree in place of "element" - which is moved to become the child of the node -*/ +/** + * Insert "node" into markup tree in place of "element" + * which is moved to become the child of the node + */ static void InsertNodeAsParent(Node *element, Node *node) { node->content = element; @@ -195,47 +71,172 @@ static void InsertNodeAsParent(Node *element, Node *node) node->next->prev = node; } -/* insert "node" into markup tree before "element" */ -void TY_(InsertNodeBeforeElement)(Node *element, Node *node) + +/** + * Inserts node into element at an appropriate location based + * on the type of node being inserted. 
+ */ +static Bool InsertMisc(Node *element, Node *node) { - Node *parent; - - parent = element->parent; - node->parent = parent; - node->next = element; - node->prev = element->prev; - element->prev = node; - - if (node->prev) - node->prev->next = node; - - if (parent->content == element) - parent->content = node; -} - -/* insert "node" into markup tree after "element" */ -void TY_(InsertNodeAfterElement)(Node *element, Node *node) -{ - Node *parent; - - parent = element->parent; - node->parent = parent; - - /* AQ - 13 Jan 2000 fix for parent == NULL */ - if (parent != NULL && parent->last == element) - parent->last = node; - else + if (node->type == CommentTag || + node->type == ProcInsTag || + node->type == CDATATag || + node->type == SectionTag || + node->type == AspTag || + node->type == JsteTag || + node->type == PhpTag ) { - node->next = element->next; - /* AQ - 13 Jan 2000 fix for node->next == NULL */ - if (node->next != NULL) - node->next->prev = node; + TY_(InsertNodeAtEnd)(element, node); + return yes; } - element->next = node; - node->prev = element; + if ( node->type == XmlDecl ) + { + Node* root = element; + while ( root && root->parent ) + root = root->parent; + if ( root && !(root->content && root->content->type == XmlDecl)) + { + TY_(InsertNodeAtStart)( root, node ); + return yes; + } + } + + /* Declared empty tags seem to be slipping through + ** the cracks. This is an experiment to figure out + ** a decent place to pick them up. + */ + if ( node->tag && + TY_(nodeIsElement)(node) && + TY_(nodeCMIsEmpty)(node) && TagId(node) == TidyTag_UNKNOWN && + (node->tag->versions & VERS_PROPRIETARY) != 0 ) + { + TY_(InsertNodeAtEnd)(element, node); + return yes; + } + + return no; } + +/** + * Move node to the head, where element is used as starting + * point in hunt for head. normally called during parsing. 
+ */ +static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node ) +{ + Node *head; + + TY_(RemoveNode)( node ); /* make sure that node is isolated */ + + if ( TY_(nodeIsElement)(node) ) + { + TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN ); + + head = TY_(FindHEAD)(doc); + assert(head != NULL); + + TY_(InsertNodeAtEnd)(head, node); + + if ( node->tag->parser ) + ParseTag( doc, node, IgnoreWhitespace ); + } + else + { + TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); + TY_(FreeNode)( doc, node ); + } +} + + +/** + * Moves given node to end of body element. + */ +static void MoveNodeToBody( TidyDocImpl* doc, Node* node ) +{ + Node* body = TY_(FindBody)( doc ); + if ( body ) + { + TY_(RemoveNode)( node ); + TY_(InsertNodeAtEnd)( body, node ); + } +} + + +/** + * Unexpected content in table row is moved to just before the table, + * in accordance with Netscape and IE. This code assumes that node hasn't + * been inserted into the row. + */ +static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row, + Node *node ) +{ + Node *table; + + /* first find the table element */ + for (table = row->parent; table; table = table->parent) + { + if ( nodeIsTABLE(table) ) + { + TY_(InsertNodeBeforeElement)( table, node ); + return; + } + } + /* No table element */ + TY_(InsertNodeBeforeElement)( row->parent, node ); +} + + +/** + * Generalised search for duplicate elements. + * Issue #166 - repeated
element. + */ +static Bool findNodeWithId( Node *node, TidyTagId tid ) +{ + Node *content; + while (node) + { + if (TagIsId(node,tid)) + return yes; + /*\ + * Issue #459 - Under certain circumstances, with many nodes, this use of + * 'for (content = node->content; content; content = content->content)' + * would produce a **forever** circle, or at least a very extended loop... + * It is sufficient to test the content, if it exists, + * to quickly iterate all nodes. Now all nodes are tested only once. + \*/ + content = node->content; + if (content) + { + if ( findNodeWithId(content,tid) ) + return yes; + } + node = node->next; + } + return no; +} + + +/** + * Perform a global search for an element. + * Issue #166 - repeated
element + */ +static Bool findNodeById( TidyDocImpl* doc, TidyTagId tid ) +{ + Node *node = (doc ? doc->root.content : NULL); + return findNodeWithId( node,tid ); +} + + +/***************************************************************************//* + ** MARK: - Decision Making + ***************************************************************************/ + + +/** + * Indicates whether or not element can be pruned based on content, + * user settings, etc. + */ static Bool CanPrune( TidyDocImpl* doc, Node *element ) { if ( !cfgBool(doc, TidyDropEmptyElems) ) @@ -321,150 +322,27 @@ static Bool CanPrune( TidyDocImpl* doc, Node *element ) return yes; } -/* return next element */ -Node *TY_(TrimEmptyElement)( TidyDocImpl* doc, Node *element ) + +/** + * Indicates whether or not node is a descendant of a tag of the given tid. + */ +static Bool DescendantOf( Node *element, TidyTagId tid ) { - if ( CanPrune(doc, element) ) + Node *parent; + for ( parent = element->parent; + parent != NULL; + parent = parent->parent ) { - if (element->type != TextNode) - { - doc->footnotes |= FN_TRIM_EMPTY_ELEMENT; - TY_(Report)(doc, element, NULL, TRIM_EMPTY_ELEMENT); - } - - return TY_(DiscardElement)(doc, element); + if ( TagIsId(parent, tid) ) + return yes; } - return element->next; + return no; } -Node* TY_(DropEmptyElements)(TidyDocImpl* doc, Node* node) -{ - Node* next; - - while (node) - { - next = node->next; - - if (node->content) - TY_(DropEmptyElements)(doc, node->content); - - if (!TY_(nodeIsElement)(node) && - !(TY_(nodeIsText)(node) && !(node->start < node->end))) - { - node = next; - continue; - } - - next = TY_(TrimEmptyElement)(doc, node); - node = next; - } - - return node; -} - -/* - errors in positioning of form start or end tags - generally require human intervention to fix - Issue #166 - repeated
element also uses this flag - to indicate duplicates, discarded -*/ -static void BadForm( TidyDocImpl* doc ) -{ - doc->badForm |= flg_BadForm; - /* doc->errors++; */ -} - -/* - This maps - hello world - to - hello world - - If last child of element is a text node - then trim trailing white space character - moving it to after element's end tag. -*/ -static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last ) -{ - Lexer* lexer = doc->lexer; - byte c; - - if (TY_(nodeIsText)(last)) - { - if (last->end > last->start) - { - c = (byte) lexer->lexbuf[ last->end - 1 ]; - - if ( c == ' ' ) - { - last->end -= 1; - if ( (element->tag->model & CM_INLINE) && - !(element->tag->model & CM_FIELD) ) - lexer->insertspace = yes; - } - } - } -} - -/* Only true for text nodes. */ -Bool TY_(IsBlank)(Lexer *lexer, Node *node) -{ - Bool isBlank = TY_(nodeIsText)(node); - if ( isBlank ) - isBlank = ( node->end == node->start || /* Zero length */ - ( node->end == node->start+1 /* or one blank. */ - && lexer->lexbuf[node->start] == ' ' ) ); - return isBlank; -} - -/* - This maps -

hello world - to -

hello world - - Trims initial space, by moving it before the - start tag, or if this element is the first in - parent's content, then by discarding the space -*/ -static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text ) -{ - Lexer* lexer = doc->lexer; - Node *prev, *node; - - if ( TY_(nodeIsText)(text) && - lexer->lexbuf[text->start] == ' ' && - text->start < text->end ) - { - if ( (element->tag->model & CM_INLINE) && - !(element->tag->model & CM_FIELD) ) - { - prev = element->prev; - - if (TY_(nodeIsText)(prev)) - { - if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ') - lexer->lexbuf[(prev->end)++] = ' '; - - ++(element->start); - } - else /* create new node */ - { - node = TY_(NewNode)(lexer->allocator, lexer); - node->start = (element->start)++; - node->end = element->start; - lexer->lexbuf[node->start] = ' '; - TY_(InsertNodeBeforeElement)(element ,node); - DEBUG_LOG(SPRTF("TrimInitialSpace: Created text node, inserted before <%s>\n", - (element->element ? element->element : "unknown"))); - } - } - - /* discard the space in current node */ - ++(text->start); - } -} +/** + * Indicates whether or not node is a descendant of a pre tag. + */ static Bool IsPreDescendant(Node* node) { Node *parent = node->parent; @@ -480,6 +358,10 @@ static Bool IsPreDescendant(Node* node) return no; } + +/** + * Indicates whether or not trailing whitespace should be cleaned. + */ static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node) { Node* next; @@ -531,6 +413,10 @@ static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node) return no; } + +/** + * Indicates whether or not leading whitespace should be cleaned. 
+ */ static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node) { if (!TY_(nodeIsText)(node)) @@ -565,10 +451,149 @@ static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node) return no; } + +/** + * Indicates whether or not the content of the given node is acceptable + * content for pre elements + */ +static Bool PreContent( TidyDocImpl* ARG_UNUSED(doc), Node* node ) +{ + /* p is coerced to br's, Text OK too */ + if ( nodeIsP(node) || TY_(nodeIsText)(node) ) + return yes; + + if ( node->tag == NULL || + nodeIsPARAM(node) || + !TY_(nodeHasCM)(node, CM_INLINE|CM_NEW) ) + return no; + + return yes; +} + + +/** + * Indicates whether or not the only content model for the given node + * is CM_INLINE. + */ +static Bool nodeCMIsOnlyInline( Node* node ) +{ + return TY_(nodeHasCM)( node, CM_INLINE ) && !TY_(nodeHasCM)( node, CM_BLOCK ); +} + + +/***************************************************************************//* + ** MARK: - Information Accumulation + ***************************************************************************/ + + +/** + * Errors in positioning of form start or end tags + * generally require human intervention to fix. + * Issue #166 - repeated

element also uses this flag + * to indicate duplicates, discarded. + */ +static void BadForm( TidyDocImpl* doc ) +{ + doc->badForm |= flg_BadForm; +} + + +/***************************************************************************//* + ** MARK: - Fixes and Touchup + ***************************************************************************/ + + +/** + * This maps + * hello world + * to + * hello world + * + * If last child of element is a text node + * then trim trailing white space character + * moving it to after element's end tag. + */ +static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last ) +{ + Lexer* lexer = doc->lexer; + byte c; + + if (TY_(nodeIsText)(last)) + { + if (last->end > last->start) + { + c = (byte) lexer->lexbuf[ last->end - 1 ]; + + if ( c == ' ' ) + { + last->end -= 1; + if ( (element->tag->model & CM_INLINE) && + !(element->tag->model & CM_FIELD) ) + lexer->insertspace = yes; + } + } + } +} + + +/** + * This maps + *

hello world + * to + *

hello world + * + * Trims initial space, by moving it before the + * start tag, or if this element is the first in + * parent's content, then by discarding the space + */ +static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text ) +{ + Lexer* lexer = doc->lexer; + Node *prev, *node; + + if ( TY_(nodeIsText)(text) && + lexer->lexbuf[text->start] == ' ' && + text->start < text->end ) + { + if ( (element->tag->model & CM_INLINE) && + !(element->tag->model & CM_FIELD) ) + { + prev = element->prev; + + if (TY_(nodeIsText)(prev)) + { + if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ') + lexer->lexbuf[(prev->end)++] = ' '; + + ++(element->start); + } + else /* create new node */ + { + node = TY_(NewNode)(lexer->allocator, lexer); + node->start = (element->start)++; + node->end = element->start; + lexer->lexbuf[node->start] = ' '; + TY_(InsertNodeBeforeElement)(element ,node); + DEBUG_LOG(SPRTF("TrimInitialSpace: Created text node, inserted before <%s>\n", + (element->element ? element->element : "unknown"))); + } + } + + /* discard the space in current node */ + ++(text->start); + } +} + + +/** + * Cleans whitespace from text nodes, and drops such nodes if emptied + * completely as a result. + */ static void CleanSpaces(TidyDocImpl* doc, Node* node) { - Node* next; - + Stack *stack = TY_(newStack)(doc, 16); + Node *next; + while (node) { next = node->next; @@ -585,30 +610,34 @@ static void CleanSpaces(TidyDocImpl* doc, Node* node) { TY_(RemoveNode)(node); TY_(FreeNode)(doc, node); - node = next; - + node = next ? next : TY_(pop)(stack); continue; } if (node->content) - CleanSpaces(doc, node->content); + { + TY_(push)(stack, next); + node = node->content; + continue; + } - node = next; + node = next ? next : TY_(pop)(stack); } + TY_(freeStack)(stack); } -/* - Move initial and trailing space out. - This routine maps: - hello world - to - hello world - and - hello world - to - hello world -*/ +/** + * Move initial and trailing space out. 
+ * This routine maps: + * hello world + * to + * hello world + * and + * hello world + * to + * hello world + */ static void TrimSpaces( TidyDocImpl* doc, Node *element) { Node* text = element->content; @@ -625,97 +654,11 @@ static void TrimSpaces( TidyDocImpl* doc, Node *element) TrimTrailingSpace(doc, element, text); } -static Bool DescendantOf( Node *element, TidyTagId tid ) -{ - Node *parent; - for ( parent = element->parent; - parent != NULL; - parent = parent->parent ) - { - if ( TagIsId(parent, tid) ) - return yes; - } - return no; -} -static Bool InsertMisc(Node *element, Node *node) -{ - if (node->type == CommentTag || - node->type == ProcInsTag || - node->type == CDATATag || - node->type == SectionTag || - node->type == AspTag || - node->type == JsteTag || - node->type == PhpTag ) - { - TY_(InsertNodeAtEnd)(element, node); - return yes; - } - - if ( node->type == XmlDecl ) - { - Node* root = element; - while ( root && root->parent ) - root = root->parent; - if ( root && !(root->content && root->content->type == XmlDecl)) - { - TY_(InsertNodeAtStart)( root, node ); - return yes; - } - } - - /* Declared empty tags seem to be slipping through - ** the cracks. This is an experiment to figure out - ** a decent place to pick them up. - */ - if ( node->tag && - TY_(nodeIsElement)(node) && - TY_(nodeCMIsEmpty)(node) && TagId(node) == TidyTag_UNKNOWN && - (node->tag->versions & VERS_PROPRIETARY) != 0 ) - { - TY_(InsertNodeAtEnd)(element, node); - return yes; - } - - return no; -} - - -static void ParseTag( TidyDocImpl* doc, Node *node, GetTokenMode mode ) -{ - Lexer* lexer = doc->lexer; - - if (node->tag == NULL) /* [i_a]2 prevent crash for active content (php, asp) docs */ - return; - - /* - Fix by GLP 2000-12-21. Need to reset insertspace if this - is both a non-inline and empty tag (base, link, meta, isindex, hr, area). 
- */ - if (node->tag->model & CM_EMPTY) - { - lexer->waswhite = no; - if (node->tag->parser == NULL) - return; - } - else if (!(node->tag->model & CM_INLINE)) - lexer->insertspace = no; - - if (node->tag->parser == NULL) - return; - - if (node->type == StartEndTag) - return; - - lexer->parent = node; /* [i_a]2 added this - not sure why - CHECKME: */ - - (*node->tag->parser)( doc, node, mode ); -} - -/* - the doctype has been found after other tags, - and needs moving to before the html element -*/ +/** + * The doctype has been found after other tags, + * and needs moving to before the html element + */ static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype ) { Node* existing = TY_(FindDocType)( doc ); @@ -733,51 +676,17 @@ static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype ) } } -/* - move node to the head, where element is used as starting - point in hunt for head. normally called during parsing -*/ -static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node ) -{ - Node *head; - TY_(RemoveNode)( node ); /* make sure that node is isolated */ - - if ( TY_(nodeIsElement)(node) ) - { - TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN ); - - head = TY_(FindHEAD)(doc); - assert(head != NULL); - - TY_(InsertNodeAtEnd)(head, node); - - if ( node->tag->parser ) - ParseTag( doc, node, IgnoreWhitespace ); - } - else - { - TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); - TY_(FreeNode)( doc, node ); - } -} - -/* moves given node to end of body element */ -static void MoveNodeToBody( TidyDocImpl* doc, Node* node ) -{ - Node* body = TY_(FindBody)( doc ); - if ( body ) - { - TY_(RemoveNode)( node ); - TY_(InsertNodeAtEnd)( body, node ); - } -} +/** + * Adds style information as a class in the document or a property + * of the node to prevent indentation of inferred UL tags. 
+ */ static void AddClassNoIndent( TidyDocImpl* doc, Node *node ) { ctmbstr sprop = - "padding-left: 2ex; margin-left: 0ex" - "; margin-top: 0ex; margin-bottom: 0ex"; + "padding-left: 2ex; margin-left: 0ex" + "; margin-top: 0ex; margin-bottom: 0ex"; if ( !cfgBool(doc, TidyDecorateInferredUL) ) return; if ( cfgBool(doc, TidyMakeClean) ) @@ -786,12 +695,378 @@ static void AddClassNoIndent( TidyDocImpl* doc, Node *node ) TY_(AddStyleProperty)( doc, node, sprop ); } -/* - element is node created by the lexer - upon seeing the start tag, or by the - parser when the start tag is inferred -*/ -void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) + +/** + * If a table row is empty then insert an empty cell. This practice is + * consistent with browser behavior and avoids potential problems with + * row spanning cells. + */ +static void FixEmptyRow(TidyDocImpl* doc, Node *row) +{ + Node *cell; + + if (row->content == NULL) + { + cell = TY_(InferredTag)(doc, TidyTag_TD); + TY_(InsertNodeAtEnd)(row, cell); + TY_(Report)(doc, row, cell, MISSING_STARTTAG); + } +} + + +/***************************************************************************//* + ** MARK: - Parsers Support + ***************************************************************************/ + + +/** + * Structure used by FindDescendant_cb. + */ +struct MatchingDescendantData +{ + Node *found_node; + Bool *passed_marker_node; + + /* input: */ + TidyTagId matching_tagId; + Node *node_to_find; + Node *marker_node; +}; + + +/** + * The main engine for FindMatchingDescendant. + */ +static NodeTraversalSignal FindDescendant_cb(TidyDocImpl* ARG_UNUSED(doc), Node* node, void *propagate) +{ + struct MatchingDescendantData *cb_data = (struct MatchingDescendantData *)propagate; + + if (TagId(node) == cb_data->matching_tagId) + { + /* make sure we match up 'unknown' tags exactly! 
*/ + if (cb_data->matching_tagId != TidyTag_UNKNOWN || + (node->element != NULL && + cb_data->node_to_find != NULL && + cb_data->node_to_find->element != NULL && + 0 == TY_(tmbstrcmp)(cb_data->node_to_find->element, node->element))) + { + cb_data->found_node = node; + return ExitTraversal; + } + } + + if (cb_data->passed_marker_node && node == cb_data->marker_node) + *cb_data->passed_marker_node = yes; + + return VisitParent; +} + + +/** + * Search the parent chain (from `parent` upwards up to the root) for a node + * matching the given 'node'. + * + * When the search passes beyond the `marker_node` (which is assumed to sit + * in the parent chain), this will be flagged by setting the boolean + * referenced by `is_parent_of_marker` to `yes`. + * + * 'is_parent_of_marker' and 'marker_node' are optional parameters and may + * be NULL. + */ +static Node *FindMatchingDescendant( Node *parent, Node *node, Node *marker_node, Bool *is_parent_of_marker ) +{ + struct MatchingDescendantData cb_data = { 0 }; + cb_data.matching_tagId = TagId(node); + cb_data.node_to_find = node; + cb_data.marker_node = marker_node; + + assert(node); + + if (is_parent_of_marker) + *is_parent_of_marker = no; + + TY_(TraverseNodeTree)(NULL, parent, FindDescendant_cb, &cb_data); + return cb_data.found_node; +} + + +/** + * Finds the last list item for the given list, providing it in the + * in-out parameter. Returns yes or no if the item was the last list + * item. + */ +static Bool FindLastLI( Node *list, Node **lastli ) +{ + Node *node; + + *lastli = NULL; + for ( node = list->content; node ; node = node->next ) + if ( nodeIsLI(node) && node->type == StartTag ) + *lastli=node; + return *lastli ? yes:no; +} + + +/***************************************************************************//* + ** MARK: - Parser Stack + ***************************************************************************/ + + +/** + * Allocates and initializes the parser's stack. 
+ */ +void TY_(InitParserStack)( TidyDocImpl* doc ) +{ + uint default_size = 16; + TidyParserMemory *content = (TidyParserMemory *) TidyAlloc( doc->allocator, sizeof(TidyParserMemory) * default_size ); + + doc->stack.content = content; + doc->stack.size = default_size; + doc->stack.top = -1; + doc->stack.allocator = doc->allocator; +} + + +/** + * Frees the parser's stack when done. + */ +void TY_(FreeParserStack)( TidyDocImpl* doc ) +{ + TidyFree( doc->stack.allocator, doc->stack.content ); + + doc->stack.content = NULL; + doc->stack.size = 0; + doc->stack.top = -1; +} + + +/** + * Increase the stack size. + * TODO: don't overflow max_uint. Need a message when we can no longer increase the size beyond 429 million depth. + */ +static void growParserStack( TidyDocImpl* doc ) +{ + TidyParserMemory *content; + content = (TidyParserMemory *) TidyAlloc( doc->stack.allocator, sizeof(TidyParserMemory) * doc->stack.size * 2 ); + + memcpy( content, doc->stack.content, sizeof(TidyParserMemory) * (doc->stack.top + 1) ); + TidyFree(doc->stack.allocator, doc->stack.content); + + doc->stack.content = content; + doc->stack.size = doc->stack.size * 2; +} + + +/** + * Indicates whether or not the stack is empty. + */ +static Bool isEmptyParserStack( TidyDocImpl* doc ) +{ + return doc->stack.top < 0; +} + + +/** + * Push the parser memory to the stack. + */ +static void pushMemory( TidyDocImpl* doc, TidyParserMemory data ) +{ + if ( doc->stack.top == doc->stack.size - 1 ) + growParserStack( doc ); + + doc->stack.top++; + doc->stack.content[doc->stack.top] = data; +} + + +/** + * Peek at the parser memory. + */ +static FUNC_UNUSED TidyParserMemory peekMemory( TidyDocImpl* doc ) +{ + return doc->stack.content[doc->stack.top]; +} + + +/** + * Peek at the parser memory "mode" field. This is just a convenience + * to avoid having to create a new struct instance in the caller. 
+ */ +static GetTokenMode peekMemoryMode( TidyDocImpl* doc ) +{ + return doc->stack.content[doc->stack.top].mode; +} + + +/** + * Peek at the parser memory "identity" field. This is just a convenience + * to avoid having to create a new struct instance in the caller. + */ +static Parser* peekMemoryIdentity( TidyDocImpl* doc ) +{ + return doc->stack.content[doc->stack.top].identity; +} + + +/** + * Pop out a parser memory. + */ +static TidyParserMemory popMemory( TidyDocImpl* doc ) +{ + if ( !isEmptyParserStack( doc ) ) + { + TidyParserMemory data = doc->stack.content[doc->stack.top]; + doc->stack.top = doc->stack.top - 1; + return data; + } + TidyParserMemory blank = { NULL }; + return blank; +} + + +/***************************************************************************//* + ** MARK: - Parser Search and Instantiation + ***************************************************************************/ + + +/** + * Retrieves the correct parser for the given node, accounting for various + * conditions, and readies the lexer for parsing that node. + */ +static Parser* GetParserForNode( TidyDocImpl* doc, Node *node ) +{ + Lexer* lexer = doc->lexer; + + /* [i_a]2 prevent crash for active content (php, asp) docs */ + if (node->tag == NULL) + return NULL; + + /* + Fix by GLP 2000-12-21. Need to reset insertspace if this is both + a non-inline and empty tag (base, link, meta, isindex, hr, area). + */ + if (node->tag->model & CM_EMPTY) + { + lexer->waswhite = no; + if (node->tag->parser == NULL) + return NULL; + } + else if (!(node->tag->model & CM_INLINE)) + lexer->insertspace = no; + + if (node->tag->parser == NULL) + return NULL; + + if (node->type == StartEndTag) + return NULL; + + /* [i_a]2 added this - not sure why - CHECKME: */ + lexer->parent = node; + + return (node->tag->parser); +} + + +/** + * Instantiates the correct parser for the given node. 
This is currently + * maintained ONLY until the legacy parsers have been ported, as this + * introduces recursion when used. + */ +static void ParseTag( TidyDocImpl* doc, Node *node, GetTokenMode mode ) +{ + Parser* parser = GetParserForNode( doc, node ); + + if ( parser ) + (*parser)( doc, node, mode, no ); +} + + +/** + * The main parser body will populate the document's document root starting + * with the provided node, which generally should be the HTML node after the + * pre-HTML stuff is handled at a higher level. + * + * This parser works cooperatively with compliant parsers to pass state + * information back and forth in the TidyDocImpl's `stack`, which resides on + * the heap and prevents recursion and stack exhaustion, and also works well + * with the old-style parsers that do recurse. + * + * (The goal is to update the old-style parsers slowly and deliberately + * without causing regressions, in a series of smaller commits and updates.) + */ +void ParseHTMLWithNode( TidyDocImpl* doc, Node* node ) +{ + GetTokenMode mode = IgnoreWhitespace; + Parser* parser = NULL; + + /* + This main loop is only extinguished when all of the parser tokens are + consumed. Note that most of the parsers consume tokens as well, and + so what we're really doing here is managing parsers and preventing + recursion with cooperating parsers. + */ + while ( node ) + { + if ( (parser = GetParserForNode( doc, node )) ) + { + if ( (node = parser( doc, node, mode, no )) ) + { + /* + When a parser returns a node, it means that we have + to continue the loop rather than moving on, because it + indicates that the parser encountered a token it does not + handle. It also tells us the correct GetTokenMode to use + for it via the struct that it pushed: + */ + mode = peekMemoryMode( doc ); + continue; + } + } + + /* + If we've come this far, the parser has bottomed out, and won't be + going any deeper. 
Now we run back up the stack to close all of the + open elements and handle any parser post-processing that was needed. + Of course, other nodes might cause us to deepen the stack again, too. + */ + if ( !isEmptyParserStack( doc ) ) + { + if ( (parser = peekMemoryIdentity( doc )) ) + { + if ( (node = parser( doc, NULL, 0, yes )) ) + { + /* Another assignment from the parser. */ + mode = peekMemoryMode( doc ); + continue; + } + } else { + /* + There's no identity in the stack (it was used to pass back + a GetToken mode, and nothing else), so discard it. + */ + popMemory( doc ); + } + } + + /* + Assuming we've gotten this far, there's no more work to do and + so we can draw a nice, fresh token from the lexer. + */ + node = TY_(GetToken)( doc, mode ); + } +} + + +/***************************************************************************//* + ** MARK: - Old Parsers + ***************************************************************************/ + + +/** MARK: TY_(oldParseBlock) + * `element` is a node created by the lexer upon seeing the start tag, or + * by the parser when the start tag is inferred + */ +void* TY_(oldParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) { #if defined(ENABLE_DEBUG_LOG) static int in_parse_block = 0; @@ -813,10 +1088,10 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) in_parse_block--; SPRTF("Exit ParseBlockL 1 %d...\n",in_parse_block); #endif - return; + return NULL; } - if ( nodeIsFORM(element) && + if ( nodeIsFORM(element) && DescendantOf(element, TidyTag_FORM) ) TY_(Report)(doc, element, NULL, ILLEGAL_NESTING ); @@ -851,8 +1126,8 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) else if (mode == IgnoreWhitespace) { /* Issue #212 - Further fix in case ParseBlock() is called with 'IgnoreWhitespace' - when such a leading space may need to be inserted before this element to - preserve the browser view */ + when such a leading space may need to be inserted before 
this element to + preserve the browser view */ mode = MixedContent; } @@ -878,7 +1153,7 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) in_parse_block--; SPRTF("Exit ParseBlock 2 %d...\n",in_parse_block); #endif - return; + return NULL; } if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) ) @@ -913,7 +1188,7 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) } else if (DescendantOf( element, node->tag->id )) { - /* + /* if this is the end tag for an ancestor element then infer end tag for this element */ @@ -932,7 +1207,7 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) in_parse_block--; SPRTF("Exit ParseBlock 2 %d...\n",in_parse_block); #endif - return; + return NULL; } } } @@ -1096,7 +1371,7 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) in_parse_block--; SPRTF("Exit ParseBlock 3 %d...\n",in_parse_block); #endif - return; + return NULL; } } else if ( TY_(nodeHasCM)(node, CM_BLOCK) ) @@ -1116,7 +1391,7 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) in_parse_block--; SPRTF("Exit ParseBlock 4 %d...\n",in_parse_block); #endif - return; + return NULL; } } else /* things like list items */ @@ -1157,7 +1432,7 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) if ( !TY_(nodeHasCM)(element, CM_OPT) && !element->implicit ) TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE ); - + /* #521, warn on missing optional end-tags if not omitting them. 
*/ if ( cfgBool( doc, TidyOmitOptionalTags ) == no && TY_(nodeHasCM)(element, CM_OPT) ) TY_(Report)(doc, element, node, MISSING_ENDTAG_OPTIONAL ); @@ -1175,7 +1450,7 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) in_parse_block--; SPRTF("Exit ParseBlock 5 %d...\n",in_parse_block); #endif - return; + return NULL; } node = TY_(InferredTag)(doc, TidyTag_UL); @@ -1190,7 +1465,7 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) in_parse_block--; SPRTF("Exit ParseBlock 6 %d...\n",in_parse_block); #endif - return; + return NULL; } node = TY_(InferredTag)(doc, TidyTag_DL); @@ -1198,14 +1473,14 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) else if ( TY_(nodeHasCM)(node, CM_TABLE) || TY_(nodeHasCM)(node, CM_ROW) ) { /* http://tidy.sf.net/issue/1316307 */ - /* In exiled mode, return so table processing can + /* In exiled mode, return so table processing can continue. */ if (lexer->exiled) { #if defined(ENABLE_DEBUG_LOG) in_parse_block--; SPRTF("Exit ParseBlock 7 %d...\n",in_parse_block); #endif - return; + return NULL; } node = TY_(InferredTag)(doc, TidyTag_TABLE); } @@ -1220,7 +1495,7 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) in_parse_block--; SPRTF("Exit ParseBlock 8 %d...\n",in_parse_block); #endif - return; + return NULL; } else @@ -1230,7 +1505,7 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) in_parse_block--; SPRTF("Exit ParseBlock 9 %d...\n",in_parse_block); #endif - return; + return NULL; } } } @@ -1242,7 +1517,7 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) * href: http://www.w3.org/TR/html-markup/a.html * The interactive element a must not appear as a descendant of the a element. 
\*/ - if ( nodeIsA(node) && !node->implicit && + if ( nodeIsA(node) && !node->implicit && (nodeIsA(element) || DescendantOf(element, TidyTag_A)) ) { if (node->type != EndTag && node->attributes == NULL @@ -1274,7 +1549,7 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) in_parse_block--; SPRTF("Exit ParseBlock 9b %d...\n",in_parse_block); #endif - return; + return NULL; } /* parse known element */ @@ -1306,11 +1581,11 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) TrimSpaces( doc, element ); TY_(InsertNodeAtEnd)(element, node); - + if (node->implicit) TY_(Report)(doc, element, node, INSERTING_TAG ); - /* Issue #212 - WHY is this hard coded to 'IgnoreWhitespace' while an + /* Issue #212 - WHY is this hard coded to 'IgnoreWhitespace' while an effort has been made above to set a 'MixedContent' mode in some cases? WHY IS THE 'mode' VARIABLE NOT USED HERE???? */ ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ ); @@ -1338,205 +1613,19 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) } TrimSpaces( doc, element ); + #if defined(ENABLE_DEBUG_LOG) in_parse_block--; SPRTF("Exit ParseBlock 10 %d...\n",in_parse_block); #endif -} - -/* [i_a] svg / math */ - -struct MatchingDescendantData -{ - Node *found_node; - Bool *passed_marker_node; - - /* input: */ - TidyTagId matching_tagId; - Node *node_to_find; - Node *marker_node; -}; - -static NodeTraversalSignal FindDescendant_cb(TidyDocImpl* ARG_UNUSED(doc), Node* node, void *propagate) -{ - struct MatchingDescendantData *cb_data = (struct MatchingDescendantData *)propagate; - - if (TagId(node) == cb_data->matching_tagId) - { - /* make sure we match up 'unknown' tags exactly! 
*/ - if (cb_data->matching_tagId != TidyTag_UNKNOWN || - (node->element != NULL && - cb_data->node_to_find != NULL && - cb_data->node_to_find->element != NULL && - 0 == TY_(tmbstrcmp)(cb_data->node_to_find->element, node->element))) - { - cb_data->found_node = node; - return ExitTraversal; - } - } - - if (cb_data->passed_marker_node && node == cb_data->marker_node) - *cb_data->passed_marker_node = yes; - - return VisitParent; -} - -/* -Search the parent chain (from 'parent' upwards up to the root) for a node matching the -given 'node'. - -When the search passes beyond the 'marker_node' (which is assumed to sit in the -parent chain), this will be flagged by setting the boolean referenced by -'is_parent_of_marker' to yes. - -'is_parent_of_marker' and 'marker_node' are optional parameters and may be NULL. -*/ -static Node *FindMatchingDescendant( Node *parent, Node *node, Node *marker_node, Bool *is_parent_of_marker ) -{ - struct MatchingDescendantData cb_data = { 0 }; - cb_data.matching_tagId = TagId(node); - cb_data.node_to_find = node; - cb_data.marker_node = marker_node; - - assert(node); - - if (is_parent_of_marker) - *is_parent_of_marker = no; - - TY_(TraverseNodeTree)(NULL, parent, FindDescendant_cb, &cb_data); - return cb_data.found_node; -} - -/* - Act as a generic XML (sub)tree parser: collect each node and add it to the DOM, without any further validation. - TODO : add schema- or other-hierarchy-definition-based validation of the subtree here... -*/ -void TY_(ParseNamespace)(TidyDocImpl* doc, Node *basenode, GetTokenMode mode) -{ - Lexer* lexer = doc->lexer; - Node *node; - Node *parent = basenode; - uint istackbase; - AttVal* av; /* #130 MathML attr and entity fix! 
*/ - - /* a la : defer popping elements off the inline stack */ - TY_(DeferDup)( doc ); - istackbase = lexer->istackbase; - lexer->istackbase = lexer->istacksize; - - mode = OtherNamespace; /* Preformatted; IgnoreWhitespace; */ - - while ((node = TY_(GetToken)(doc, mode)) != NULL) - { - /* - fix check to skip action in InsertMisc for regular/empty - nodes, which we don't want here... - - The way we do it here is by checking and processing everything - and only what remains goes into InsertMisc() - */ - - /* is this a close tag? And does it match the current parent node? */ - if (node->type == EndTag) - { - /* - to prevent end tags flowing from one 'alternate namespace' we - check this in two phases: first we check if the tag is a - descendant of the current node, and when it is, we check whether - it is the end tag for a node /within/ or /outside/ the basenode. - */ - Bool outside; - Node *mp = FindMatchingDescendant(parent, node, basenode, &outside); - - if (mp != NULL) - { - /* - when mp != parent as we might expect, - infer end tags until we 'hit' the matched - parent or the basenode - */ - Node *n; - - for (n = parent; - n != NULL && n != basenode->parent && n != mp; - n = n->parent) - { - /* n->implicit = yes; */ - n->closed = yes; - TY_(Report)(doc, n->parent, n, MISSING_ENDTAG_BEFORE); - } - - /* Issue #369 - Since 'assert' is DEBUG only, and there are - simple cases where these can be fired, removing them - pending feedback from the original author! - assert(outside == no ? n == mp : 1); - assert(outside == yes ? n == basenode->parent : 1); - =================================================== */ - - if (outside == no) - { - /* EndTag for a node within the basenode subtree. Roll on... */ - n->closed = yes; - TY_(FreeNode)(doc, node); - - node = n; - parent = node->parent; - } - else - { - /* EndTag for a node outside the basenode subtree: let the caller handle that. 
*/ - TY_(UngetToken)( doc ); - node = basenode; - parent = node->parent; - } - - /* when we've arrived at the end-node for the base node, it's quitting time */ - if (node == basenode) - { - lexer->istackbase = istackbase; - assert(basenode->closed == yes); - return; - } - } - else - { - /* unmatched close tag: report an error and discard */ - /* TY_(Report)(doc, parent, node, NON_MATCHING_ENDTAG); Issue #308 - Seems wrong warning! */ - TY_(Report)(doc, parent, node, DISCARDING_UNEXPECTED); - assert(parent); - /* assert(parent->tag != node->tag); Issue #308 - Seems would always be true! */ - TY_(FreeNode)( doc, node); /* Issue #308 - Discard unexpected end tag memory */ - } - } - else if (node->type == StartTag) - { - /* #130 MathML attr and entity fix! - care if it has attributes, and 'accidently' any of those attributes match known */ - for ( av = node->attributes; av; av = av->next ) - { - av->dict = 0; /* does something need to be freed? */ - } - /* add another child to the current parent */ - TY_(InsertNodeAtEnd)(parent, node); - parent = node; - } - else - { - /* #130 MathML attr and entity fix! - care if it has attributes, and 'accidently' any of those attributes match known */ - for ( av = node->attributes; av; av = av->next ) - { - av->dict = 0; /* does something need to be freed? */ - } - TY_(InsertNodeAtEnd)(parent, node); - } - } - - TY_(Report)(doc, basenode->parent, basenode, MISSING_ENDTAG_FOR); + return NULL; } -TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode ) +/** MARK: TY_(oldParseInline) + * Parse inline element nodes. 
+ */ +void* TY_(oldParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode ) { #if defined(ENABLE_DEBUG_LOG) static int in_parse_inline = 0; @@ -1553,7 +1642,7 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode in_parse_inline--; SPRTF("Exit ParseInline 1 %d...\n",in_parse_inline); #endif - return; + return NULL; } /* @@ -1564,7 +1653,7 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode This test is carried out in PushInline and PopInline, see istack.c InlineDup(...) is not called for elements with a CM_MIXED (inline and - block) content model, e.g. or , otherwise constructs like + block) content model, e.g. or , otherwise constructs like

111222333444555

111222333444555

@@ -1605,7 +1694,7 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode then move the font element inside the anchor since otherwise it won't alter the anchor text color */ - if ( nodeIsFONT(element) && + if ( nodeIsFONT(element) && element->content && element->content == element->last ) { Node *child = element->content; @@ -1635,7 +1724,7 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode in_parse_inline--; SPRTF("Exit ParseInline 2 %d...\n",in_parse_inline); #endif - return; + return NULL; } /* ... map 2nd to if 1st is explicit */ @@ -1674,14 +1763,14 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode if (node->attributes == NULL || element->attributes == NULL) TY_(Report)(doc, element, node, NESTED_EMPHASIS); } - else if ( TY_(IsPushed)(doc, node) && node->type == StartTag && + else if ( TY_(IsPushed)(doc, node) && node->type == StartTag && nodeIsQ(node) ) { /*\ * Issue #215 - such nested quotes are NOT a problem if HTML5, so * only issue this warning if NOT HTML5 mode. \*/ - if (TY_(HTMLVersion)(doc) != HT50) + if (TY_(HTMLVersion)(doc) != HT50) { TY_(Report)(doc, element, node, NESTED_QUOTATION); } @@ -1726,14 +1815,14 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode in_parse_inline--; SPRTF("Exit ParseInline 3 %d...\n",in_parse_inline); #endif - return; + return NULL; } /* within
or
 map 

to
*/ if ( nodeIsP(node) && node->type == StartTag && ( (mode & Preformatted) || - nodeIsDT(element) || + nodeIsDT(element) || DescendantOf(element, TidyTag_DT ) ) ) @@ -1753,7 +1842,7 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode { TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT ); TY_(InsertNodeAtEnd)(element, node); - (*node->tag->parser)( doc, node, mode ); + (*node->tag->parser)( doc, node, mode, no ); continue; } @@ -1816,7 +1905,7 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode in_parse_inline--; SPRTF("Exit ParseInline 4 %d...\n",in_parse_inline); #endif - return; /* close , but will re-open it, after */ + return NULL; /* close , but will re-open it, after */ } } TY_(PopInline)( doc, element ); @@ -1840,7 +1929,7 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode in_parse_inline--; SPRTF("Exit ParseInline 5 %d...\n",in_parse_inline); #endif - return; + return NULL; } /* if parent is then discard unexpected inline end tag */ @@ -1857,7 +1946,7 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode in_parse_inline--; SPRTF("Exit ParseInline 6 %d...\n",in_parse_inline); #endif - return; + return NULL; } } @@ -1883,7 +1972,7 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode in_parse_inline--; SPRTF("Exit ParseInline 7 %d...\n",in_parse_inline); #endif - return; + return NULL; } /* @@ -1892,7 +1981,7 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode */ /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */ /* if (node->tag == doc->tags.tag_a && !node->implicit && TY_(IsPushed)(doc, node)) */ - if ( nodeIsA(node) && !node->implicit && + if ( nodeIsA(node) && !node->implicit && (nodeIsA(element) || DescendantOf(element, TidyTag_A)) ) { /* coerce to unless it has some attributes */ @@ -1920,7 +2009,7 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node 
*element, GetTokenMode in_parse_inline--; SPRTF("Exit ParseInline 8 %d...\n",in_parse_inline); #endif - return; + return NULL; } if (element->tag->model & CM_HEADING) @@ -2021,7 +2110,7 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode } - /* + /* if this is the end tag for an ancestor element then infer end tag for this element */ @@ -2035,7 +2124,7 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode if (!(element->tag->model & CM_OPT) && !element->implicit) TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE); - if( TY_(IsPushedLast)( doc, element, node ) ) + if( TY_(IsPushedLast)( doc, element, node ) ) TY_(PopInline)( doc, element ); TY_(UngetToken)( doc ); @@ -2046,13 +2135,13 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode in_parse_inline--; SPRTF("Exit ParseInline 9 %d...\n",in_parse_inline); #endif - return; + return NULL; } } } /*\ - * block level tags end this element + * block level tags end this element * Issue #333 - There seems an exception if the element is a 'span', * and the node just collected is a 'meta'. The 'meta' can not have * CM_INLINE added, nor can the 'span' have CM_MIXED added without @@ -2098,7 +2187,7 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode in_parse_inline--; SPRTF("Exit ParseInline 10 %d...\n",in_parse_inline); #endif - return; + return NULL; } } @@ -2111,7 +2200,7 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode in_parse_inline--; SPRTF("Exit ParseInline 11 %d...\n",in_parse_inline); #endif - return; + return NULL; } /* parse inline element */ @@ -2123,7 +2212,7 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode /* trim white space before
*/ if ( nodeIsBR(node) ) TrimSpaces(doc, element); - + TY_(InsertNodeAtEnd)(element, node); ParseTag(doc, node, mode); continue; @@ -2142,36 +2231,20 @@ TY_PRIVATE void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode in_parse_inline--; SPRTF("Exit ParseInline 12 %d...\n",in_parse_inline); #endif + return NULL; } -void TY_(ParseEmpty)(TidyDocImpl* doc, Node *element, GetTokenMode mode) -{ - Lexer* lexer = doc->lexer; - if ( lexer->isvoyager ) - { - Node *node = TY_(GetToken)( doc, mode); - if ( node ) - { - if ( !(node->type == EndTag && node->tag == element->tag) ) - { - /* TY_(Report)(doc, element, node, ELEMENT_NOT_EMPTY); */ - TY_(UngetToken)( doc ); - } - else - { - TY_(FreeNode)( doc, node ); - } - } - } -} -void TY_(ParseDefList)(TidyDocImpl* doc, Node *list, GetTokenMode mode) +/** MARK: TY_(oldParseDefList) + * Parses the `dl` tag. + */ +void* TY_(oldParseDefList)(TidyDocImpl* doc, Node *list, GetTokenMode mode) { Lexer* lexer = doc->lexer; Node *node, *parent; if (list->tag->model & CM_EMPTY) - return; + return NULL; lexer->insert = NULL; /* defer implicit inline start tags */ @@ -2181,7 +2254,7 @@ void TY_(ParseDefList)(TidyDocImpl* doc, Node *list, GetTokenMode mode) { TY_(FreeNode)( doc, node); list->closed = yes; - return; + return NULL; } /* deal with comments etc. 
*/ @@ -2202,7 +2275,7 @@ void TY_(ParseDefList)(TidyDocImpl* doc, Node *list, GetTokenMode mode) continue; } - /* + /* if this is the end tag for an ancestor element then infer end tag for this element */ @@ -2233,7 +2306,7 @@ void TY_(ParseDefList)(TidyDocImpl* doc, Node *list, GetTokenMode mode) TY_(Report)(doc, list, node, MISSING_ENDTAG_BEFORE); TY_(UngetToken)( doc ); - return; + return NULL; } } if (discardIt) @@ -2287,12 +2360,12 @@ void TY_(ParseDefList)(TidyDocImpl* doc, Node *list, GetTokenMode mode) if (!(node->tag->model & (CM_BLOCK | CM_INLINE))) { TY_(Report)(doc, list, node, TAG_NOT_ALLOWED_IN); - return; + return NULL; } /* if DD appeared directly in BODY then exclude blocks */ if (!(node->tag->model & CM_INLINE) && lexer->excludeBlocks) - return; + return NULL; node = TY_(InferredTag)(doc, TidyTag_DD); TY_(Report)(doc, list, node, MISSING_STARTTAG); @@ -2304,27 +2377,21 @@ void TY_(ParseDefList)(TidyDocImpl* doc, Node *list, GetTokenMode mode) TY_(FreeNode)( doc, node); continue; } - + /* node should be

or
*/ TY_(InsertNodeAtEnd)(list, node); ParseTag( doc, node, IgnoreWhitespace); } TY_(Report)(doc, list, node, MISSING_ENDTAG_FOR); + return NULL; } -static Bool FindLastLI( Node *list, Node **lastli ) -{ - Node *node; - *lastli = NULL; - for ( node = list->content; node ; node = node->next ) - if ( nodeIsLI(node) && node->type == StartTag ) - *lastli=node; - return *lastli ? yes:no; -} - -void TY_(ParseList)(TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode)) +/** MARK: TY_(oldParseList) + * Parses list tags. + */ +void* TY_(oldParseList)(TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode)) { #if defined(ENABLE_DEBUG_LOG) static int in_parse_list = 0; @@ -2344,7 +2411,7 @@ void TY_(ParseList)(TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode)) in_parse_list--; SPRTF("Exit ParseList 1 %d... CM_EMPTY\n",in_parse_list); #endif - return; + return NULL; } lexer->insert = NULL; /* defer implicit inline start tags */ @@ -2359,7 +2426,7 @@ void TY_(ParseList)(TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode)) in_parse_list--; SPRTF("Exit ParseList 2 %d... Endtag\n",in_parse_list); #endif - return; + return NULL; } /* deal with comments etc. */ @@ -2388,7 +2455,7 @@ void TY_(ParseList)(TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode)) } - /* + /* if this is the end tag for an ancestor element then infer end tag for this element */ @@ -2426,7 +2493,7 @@ void TY_(ParseList)(TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode)) in_parse_list--; SPRTF("Exit ParseList 3 %d... No End Tag\n",in_parse_list); #endif - return; + return NULL; } } @@ -2459,7 +2526,7 @@ void TY_(ParseList)(TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode)) in_parse_list--; SPRTF("Exit ParseList 4 %d... No End Tag\n",in_parse_list); #endif - return; + return NULL; } /* http://tidy.sf.net/issue/1316307 */ /* In exiled mode, return so table processing can continue. 
*/ @@ -2471,12 +2538,12 @@ void TY_(ParseList)(TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode)) in_parse_list--; SPRTF("Exit ParseList 5 %d... exiled\n",in_parse_list); #endif - return; + return NULL; } /* http://tidy.sf.net/issue/836462 - If "list" is an unordered list, insert the next tag within - the last
  • to preserve the numbering to match the visual - rendering of most browsers. */ + If "list" is an unordered list, insert the next tag within + the last
  • to preserve the numbering to match the visual - rendering of most browsers. */ + If "list" is an ordered list, insert the next tag within + the last
  • */ wasblock = TY_(nodeHasCM)(node,CM_BLOCK); node = TY_(InferredTag)(doc, TidyTag_LI); - /* Add "display: inline" to avoid a blank line after
  • with + /* Add "display: inline" to avoid a blank line after
  • with Internet Explorer. See http://tidy.sf.net/issue/836462 */ TY_(AddStyleProperty)( doc, node, wasblock ? "list-style: none; display: inline" - : "list-style: none" + : "list-style: none" ); TY_(Report)(doc, list, node, MISSING_STARTTAG ); TY_(InsertNodeAtEnd)(list,node); @@ -2510,56 +2577,21 @@ void TY_(ParseList)(TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode)) in_parse_list--; SPRTF("Exit ParseList 6 %d... missing end tag\n",in_parse_list); #endif + return NULL; } -/* - unexpected content in table row is moved to just before - the table in accordance with Netscape and IE. This code - assumes that node hasn't been inserted into the row. -*/ -static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row, - Node *node ) -{ - Node *table; - /* first find the table element */ - for (table = row->parent; table; table = table->parent) - { - if ( nodeIsTABLE(table) ) - { - TY_(InsertNodeBeforeElement)( table, node ); - return; - } - } - /* No table element */ - TY_(InsertNodeBeforeElement)( row->parent, node ); -} - -/* - if a table row is empty then insert an empty cell - this practice is consistent with browser behavior - and avoids potential problems with row spanning cells -*/ -static void FixEmptyRow(TidyDocImpl* doc, Node *row) -{ - Node *cell; - - if (row->content == NULL) - { - cell = TY_(InferredTag)(doc, TidyTag_TD); - TY_(InsertNodeAtEnd)(row, cell); - TY_(Report)(doc, row, cell, MISSING_STARTTAG); - } -} - -void TY_(ParseRow)(TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode)) +/** MARK: TY_(oldParseRow) + * Parses the `row` tag. 
+ */ +void* TY_(oldParseRow)(TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode)) { Lexer* lexer = doc->lexer; Node *node; Bool exclude_state; if (row->tag->model & CM_EMPTY) - return; + return NULL; while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) { @@ -2570,16 +2602,16 @@ void TY_(ParseRow)(TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode)) TY_(FreeNode)( doc, node); row->closed = yes; FixEmptyRow( doc, row); - return; + return NULL; } /* New row start implies end of current row */ TY_(UngetToken)( doc ); FixEmptyRow( doc, row); - return; + return NULL; } - /* + /* if this is the end tag for an ancestor element then infer end tag for this element */ @@ -2589,7 +2621,7 @@ void TY_(ParseRow)(TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode)) && DescendantOf(row, TagId(node)) ) { TY_(UngetToken)( doc ); - return; + return NULL; } if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) ) @@ -2634,7 +2666,7 @@ void TY_(ParseRow)(TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode)) if ( TY_(nodeHasCM)(node, CM_ROWGRP) ) { TY_(UngetToken)( doc ); - return; + return NULL; } if (node->type == EndTag) @@ -2687,7 +2719,7 @@ void TY_(ParseRow)(TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode)) TY_(FreeNode)( doc, node); continue; } - + /* node should be
  • or */ TY_(InsertNodeAtEnd)(row, node); exclude_state = lexer->excludeBlocks; @@ -2700,16 +2732,20 @@ void TY_(ParseRow)(TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode)) while ( lexer->istacksize > lexer->istackbase ) TY_(PopInline)( doc, NULL ); } - + return NULL; } -void TY_(ParseRowGroup)(TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSED(mode)) + +/** MARK: TY_(oldParseRowGroup) + * Parses the `rowgroup` tag. + */ +void* TY_(oldParseRowGroup)(TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSED(mode)) { Lexer* lexer = doc->lexer; Node *node, *parent; if (rowgroup->tag->model & CM_EMPTY) - return; + return NULL; while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) { @@ -2719,18 +2755,18 @@ void TY_(ParseRowGroup)(TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSE { rowgroup->closed = yes; TY_(FreeNode)( doc, node); - return; + return NULL; } TY_(UngetToken)( doc ); - return; + return NULL; } /* if
    infer end tag */ if ( nodeIsTABLE(node) && node->type == EndTag ) { TY_(UngetToken)( doc ); - return; + return NULL; } /* deal with comments etc. */ @@ -2780,7 +2816,7 @@ void TY_(ParseRowGroup)(TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSE } } - /* + /* if this is the end tag for ancestor element then infer end tag for this element */ @@ -2810,7 +2846,7 @@ void TY_(ParseRowGroup)(TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSE if (node->tag == parent->tag) { TY_(UngetToken)( doc ); - return; + return NULL; } } } @@ -2824,7 +2860,7 @@ void TY_(ParseRowGroup)(TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSE if (node->type != EndTag) { TY_(UngetToken)( doc ); - return; + return NULL; } } @@ -2834,7 +2870,7 @@ void TY_(ParseRowGroup)(TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSE TY_(FreeNode)( doc, node); continue; } - + if ( !nodeIsTR(node) ) { node = TY_(InferredTag)(doc, TidyTag_TR); @@ -2846,15 +2882,19 @@ void TY_(ParseRowGroup)(TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSE TY_(InsertNodeAtEnd)(rowgroup, node); ParseTag(doc, node, IgnoreWhitespace); } - + return NULL; } -void TY_(ParseColGroup)(TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSED(mode)) + +/** MARK: TY_(oldParseColGroup) + * Parses the `colgroup` tag. 
+ */ +void* TY_(oldParseColGroup)(TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSED(mode)) { Node *node, *parent; if (colgroup->tag->model & CM_EMPTY) - return; + return NULL; while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) { @@ -2862,10 +2902,10 @@ void TY_(ParseColGroup)(TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSE { TY_(FreeNode)( doc, node); colgroup->closed = yes; - return; + return NULL; } - /* + /* if this is the end tag for an ancestor element then infer end tag for this element */ @@ -2886,7 +2926,7 @@ void TY_(ParseColGroup)(TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSE if (node->tag == parent->tag) { TY_(UngetToken)( doc ); - return; + return NULL; } } } @@ -2894,7 +2934,7 @@ void TY_(ParseColGroup)(TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSE if (TY_(nodeIsText)(node)) { TY_(UngetToken)( doc ); - return; + return NULL; } /* deal with comments etc. */ @@ -2912,7 +2952,7 @@ void TY_(ParseColGroup)(TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSE if ( !nodeIsCOL(node) ) { TY_(UngetToken)( doc ); - return; + return NULL; } if (node->type == EndTag) @@ -2921,14 +2961,19 @@ void TY_(ParseColGroup)(TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSE TY_(FreeNode)( doc, node); continue; } - + /* node should be */ TY_(InsertNodeAtEnd)(colgroup, node); ParseTag(doc, node, IgnoreWhitespace); } + return NULL; } -void TY_(ParseTableTag)(TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(mode)) + +/** MARK: TY_(oldParseTableTag) + * Parses the `table` tag. 
+ */ +void* TY_(oldParseTableTag)(TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(mode)) { #if defined(ENABLE_DEBUG_LOG) static int in_parse_table = 0; @@ -2944,7 +2989,7 @@ void TY_(ParseTableTag)(TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(m in_parse_table++; SPRTF("Entering ParseTableTag %d...\n",in_parse_table); #endif - + while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) { if (node->tag == table->tag ) @@ -2956,7 +3001,7 @@ void TY_(ParseTableTag)(TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(m else { /* Issue #498 - If a in a
    - * just close the current table, and issue a + * just close the current table, and issue a * warning. The previous action was to discard * this second
    */ @@ -2969,7 +3014,7 @@ void TY_(ParseTableTag)(TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(m in_parse_table--; SPRTF("Exit ParseTableTag 1 %d... EndTag\n",in_parse_table); #endif - return; + return NULL; } /* deal with comments etc. */ @@ -3000,7 +3045,7 @@ void TY_(ParseTableTag)(TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(m TY_(Report)(doc, table, node, TAG_NOT_ALLOWED_IN); lexer->exiled = yes; - if (node->type != TextNode) + if (node->type != TextNode) ParseTag(doc, node, IgnoreWhitespace); lexer->exiled = no; @@ -3013,7 +3058,7 @@ void TY_(ParseTableTag)(TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(m } } - /* + /* if this is the end tag for an ancestor element then infer end tag for this element */ @@ -3049,7 +3094,7 @@ void TY_(ParseTableTag)(TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(m in_parse_table--; SPRTF("Exit ParseTableTag 2 %d... missing EndTag\n",in_parse_table); #endif - return; + return NULL; } } } @@ -3063,7 +3108,7 @@ void TY_(ParseTableTag)(TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(m in_parse_table--; SPRTF("Exit ParseTableTag 3 %d... CM_TABLE\n",in_parse_table); #endif - return; + return NULL; } if (TY_(nodeIsElement)(node)) @@ -3084,35 +3129,25 @@ void TY_(ParseTableTag)(TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(m in_parse_table--; SPRTF("Exit ParseTableTag 4 %d... missing end\n",in_parse_table); #endif + return NULL; } -/* acceptable content for pre elements */ -static Bool PreContent( TidyDocImpl* ARG_UNUSED(doc), Node* node ) -{ - /* p is coerced to br's, Text OK too */ - if ( nodeIsP(node) || TY_(nodeIsText)(node) ) - return yes; - if ( node->tag == NULL || - nodeIsPARAM(node) || - !TY_(nodeHasCM)(node, CM_INLINE|CM_NEW) ) - return no; - - return yes; -} - -void TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) ) +/** MARK: TY_(oldParsePre) + * Parses the `pre` tag. 
+ */ +void* TY_(oldParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) ) { Node *node; if (pre->tag->model & CM_EMPTY) - return; + return NULL; TY_(InlineDup)( doc, NULL ); /* tell lexer to insert inlines if needed */ while ((node = TY_(GetToken)(doc, Preformatted)) != NULL) { - if ( node->type == EndTag && + if ( node->type == EndTag && (node->tag == pre->tag || DescendantOf(pre, TagId(node))) ) { if (nodeIsBODY(node) || nodeIsHTML(node)) @@ -3132,7 +3167,7 @@ void TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) ) } pre->closed = yes; TrimSpaces(doc, pre); - return; + return NULL; } if (TY_(nodeIsText)(node)) @@ -3160,13 +3195,13 @@ void TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) ) /* fix for http://tidy.sf.net/bug/772205 */ if (node->type == EndTag) { - /* http://tidy.sf.net/issue/1590220 */ + /* http://tidy.sf.net/issue/1590220 */ if ( doc->lexer->exiled && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) ) { TY_(UngetToken)(doc); TrimSpaces(doc, pre); - return; + return NULL; } TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED); @@ -3182,7 +3217,7 @@ void TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) ) TY_(Report)(doc, pre, node, MISSING_ENDTAG_BEFORE); TY_(UngetToken)(doc); - return; + return NULL; } /* @@ -3211,7 +3246,7 @@ void TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) )
    ...
    ...
    ...
    (Internet Explorer)
    ...

    ...

    ...
    (Mozilla, Opera 6)
    ...
    ...

    ...
    (Opera 7) - + or something similar, they could also be closing the
     and propagate
                   the 
     into the newly opened 

    . @@ -3242,7 +3277,7 @@ void TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) ) /* trim white space before

    in

    */
                     TrimSpaces(doc, pre);
    -            
    +
                     /* coerce both 

    and

    to
    */ TY_(CoerceNode)(doc, node, TidyTag_BR, no, no); TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */ @@ -3261,7 +3296,7 @@ void TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) ) /* trim white space before
    */ if ( nodeIsBR(node) ) TrimSpaces(doc, pre); - + TY_(InsertNodeAtEnd)(pre, node); ParseTag(doc, node, Preformatted); continue; @@ -3273,9 +3308,14 @@ void TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) ) } TY_(Report)(doc, pre, node, MISSING_ENDTAG_FOR); + return NULL; } -void TY_(ParseOptGroup)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode)) + +/** MARK: TY_(oldParseOptGroup) + * Parses the `optgroup` tag. + */ +void* TY_(oldParseOptGroup)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode)) { Lexer* lexer = doc->lexer; Node *node; @@ -3289,14 +3329,14 @@ void TY_(ParseOptGroup)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(m TY_(FreeNode)( doc, node); field->closed = yes; TrimSpaces(doc, field); - return; + return NULL; } /* deal with comments etc. */ if (InsertMisc(field, node)) continue; - if ( node->type == StartTag && + if ( node->type == StartTag && (nodeIsOPTION(node) || nodeIsOPTGROUP(node)) ) { if ( nodeIsOPTGROUP(node) ) @@ -3311,10 +3351,14 @@ void TY_(ParseOptGroup)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(m TY_(Report)(doc, field, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, node); } + return NULL; } -void TY_(ParseSelect)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode)) +/** MARK: TY_(oldParseSelect) + * Parses the `select` tag. + */ +void* TY_(oldParseSelect)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode)) { #if defined(ENABLE_DEBUG_LOG) static int in_parse_select = 0; @@ -3339,18 +3383,18 @@ void TY_(ParseSelect)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mod in_parse_select--; SPRTF("Exit ParseSelect 1 %d...\n",in_parse_select); #endif - return; + return NULL; } /* deal with comments etc. 
*/ if (InsertMisc(field, node)) continue; - if ( node->type == StartTag && + if ( node->type == StartTag && ( nodeIsOPTION(node) || nodeIsOPTGROUP(node) || nodeIsDATALIST(node) || - nodeIsSCRIPT(node)) + nodeIsSCRIPT(node)) ) { TY_(InsertNodeAtEnd)(field, node); @@ -3368,10 +3412,14 @@ void TY_(ParseSelect)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mod in_parse_select--; SPRTF("Exit ParseSelect 2 %d...\n",in_parse_select); #endif + return NULL; } -/* HTML5 */ -void TY_(ParseDatalist)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode)) + +/** MARK: TY_(oldParseDataList) + * Parses the `datalist` tag. + */ +void* TY_(oldParseDatalist)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode)) { #if defined(ENABLE_DEBUG_LOG) static int in_parse_datalist = 0; @@ -3396,18 +3444,18 @@ void TY_(ParseDatalist)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(m in_parse_datalist--; SPRTF("Exit ParseDatalist 1 %d...\n",in_parse_datalist); #endif - return; + return NULL; } /* deal with comments etc. */ if (InsertMisc(field, node)) continue; - if ( node->type == StartTag && + if ( node->type == StartTag && ( nodeIsOPTION(node) || nodeIsOPTGROUP(node) || nodeIsDATALIST(node) || - nodeIsSCRIPT(node)) + nodeIsSCRIPT(node)) ) { TY_(InsertNodeAtEnd)(field, node); @@ -3425,12 +3473,14 @@ void TY_(ParseDatalist)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(m in_parse_datalist--; SPRTF("Exit ParseDatalist 2 %d...\n",in_parse_datalist); #endif + return NULL; } - - -void TY_(ParseText)(TidyDocImpl* doc, Node *field, GetTokenMode mode) +/** MARK: TY_(oldParseText) + * Parses the `option` and `textarea` tags. + */ +void* TY_(oldParseText)(TidyDocImpl* doc, Node *field, GetTokenMode mode) { Lexer* lexer = doc->lexer; Node *node; @@ -3449,7 +3499,7 @@ void TY_(ParseText)(TidyDocImpl* doc, Node *field, GetTokenMode mode) TY_(FreeNode)( doc, node); field->closed = yes; TrimSpaces(doc, field); - return; + return NULL; } /* deal with comments etc. 
*/ @@ -3475,7 +3525,7 @@ void TY_(ParseText)(TidyDocImpl* doc, Node *field, GetTokenMode mode) /* for textarea should all cases of < and & be escaped? */ /* discard inline tags e.g. font */ - if ( node->tag + if ( node->tag && node->tag->model & CM_INLINE && !(node->tag->model & CM_FIELD)) /* #487283 - fix by Lee Passey 25 Jan 02 */ { @@ -3490,15 +3540,19 @@ void TY_(ParseText)(TidyDocImpl* doc, Node *field, GetTokenMode mode) TY_(UngetToken)( doc ); TrimSpaces(doc, field); - return; + return NULL; } if (!(field->tag->model & CM_OPT)) TY_(Report)(doc, field, node, MISSING_ENDTAG_FOR); + return NULL; } -void TY_(ParseTitle)(TidyDocImpl* doc, Node *title, GetTokenMode ARG_UNUSED(mode)) +/** MARK: TY_(oldParseTitle) + * Parses the `title` tag. + */ +void* TY_(oldParseTitle)(TidyDocImpl* doc, Node *title, GetTokenMode ARG_UNUSED(mode)) { Node *node; while ((node = TY_(GetToken)(doc, MixedContent)) != NULL) @@ -3516,7 +3570,7 @@ void TY_(ParseTitle)(TidyDocImpl* doc, Node *title, GetTokenMode ARG_UNUSED(mode TY_(FreeNode)( doc, node); title->closed = yes; TrimSpaces(doc, title); - return; + return NULL; } if (TY_(nodeIsText)(node)) @@ -3551,23 +3605,29 @@ void TY_(ParseTitle)(TidyDocImpl* doc, Node *title, GetTokenMode ARG_UNUSED(mode TY_(Report)(doc, title, node, MISSING_ENDTAG_BEFORE); TY_(UngetToken)( doc ); TrimSpaces(doc, title); - return; + return NULL; } TY_(Report)(doc, title, node, MISSING_ENDTAG_FOR); + return NULL; } -/* - This isn't quite right for CDATA content as it recognises - tags within the content and parses them accordingly. - This will unfortunately screw up scripts which include - < + letter, < + !, < + ? or < + / + letter -*/ -void TY_(ParseScript)(TidyDocImpl* doc, Node *script, GetTokenMode ARG_UNUSED(mode)) +/** MARK: TY_(oldParseScript) + * Parses the `script` tag. + * + * @todo This isn't quite right for CDATA content as it recognises tags + * within the content and parses them accordingly. 
This will unfortunately + * screw up scripts which include: + * < + letter + * < + ! + * < + ? + * < + / + letter + */ +void* TY_(oldParseScript)(TidyDocImpl* doc, Node *script, GetTokenMode ARG_UNUSED(mode)) { Node *node; - + doc->lexer->parent = script; node = TY_(GetToken)(doc, CdataContent); doc->lexer->parent = NULL; @@ -3580,7 +3640,7 @@ void TY_(ParseScript)(TidyDocImpl* doc, Node *script, GetTokenMode ARG_UNUSED(mo { /* handle e.g. a document like "