/* parser.c -- HTML Parser

  (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
  See tidy.h for the copyright notice.

*/

#include "tidy-int.h"
#include "lexer.h"
#include "parser.h"
#include "message.h"
#include "clean.h"
#include "tags.h"
#include "tmbstr.h"
#include "sprtf.h"


/****************************************************************************//*
 ** MARK: - Configuration Options
 ***************************************************************************/


/**
 *  Issue #72  - Need to know to avoid error-reporting - no warning only if
 *               --show-body-only yes.
 *  Issue #132 - Likewise avoid warning if showing body only.
 *
 *  Evaluates to `yes` when the TidyBodyOnly option is set to TidyYesState,
 *  otherwise `no`.  The whole expansion (and the argument) is parenthesised
 *  so the macro can safely be combined with other operators, e.g.
 *  `showingBodyOnly(doc) && x` or `showingBodyOnly(doc) == yes`; without
 *  the outer parentheses the trailing operator would bind to the `no` arm
 *  of the conditional expression.
 */
#define showingBodyOnly(doc) \
    ((cfgAutoBool((doc), TidyBodyOnly) == TidyYesState) ? yes : no)


/****************************************************************************//*
 ** MARK: - Forward Declarations
 ***************************************************************************/


static Node* ParseXMLElement(TidyDocImpl* doc, Node *element, GetTokenMode mode);


/****************************************************************************//*
 ** MARK: - Node Operations
 ***************************************************************************/


/**
 *  Generalised search for duplicate elements.
 *  Issue #166 - repeated <main>
element. */ static Bool findNodeWithId( Node *node, TidyTagId tid ) { Node *content; while (node) { if (TagIsId(node,tid)) return yes; /*\ * Issue #459 - Under certain circumstances, with many node this use of * 'for (content = node->content; content; content = content->content)' * would produce a **forever** circle, or at least a very extended loop... * It is sufficient to test the content, if it exists, * to quickly iterate all nodes. Now all nodes are tested only once. \*/ content = node->content; if (content) { if ( findNodeWithId(content,tid) ) return yes; } node = node->next; } return no; } /** * Perform a global search for an element. * Issue #166 - repeated
element */ static Bool findNodeById( TidyDocImpl* doc, TidyTagId tid ) { Node *node = (doc ? doc->root.content : NULL); return findNodeWithId( node,tid ); } /** * Inserts node into element at an appropriate location based * on the type of node being inserted. */ static Bool InsertMisc(Node *element, Node *node) { if (node->type == CommentTag || node->type == ProcInsTag || node->type == CDATATag || node->type == SectionTag || node->type == AspTag || node->type == JsteTag || node->type == PhpTag ) { TY_(InsertNodeAtEnd)(element, node); return yes; } if ( node->type == XmlDecl ) { Node* root = element; while ( root && root->parent ) root = root->parent; if ( root && !(root->content && root->content->type == XmlDecl)) { TY_(InsertNodeAtStart)( root, node ); return yes; } } /* Declared empty tags seem to be slipping through ** the cracks. This is an experiment to figure out ** a decent place to pick them up. */ if ( node->tag && TY_(nodeIsElement)(node) && TY_(nodeCMIsEmpty)(node) && TagId(node) == TidyTag_UNKNOWN && (node->tag->versions & VERS_PROPRIETARY) != 0 ) { TY_(InsertNodeAtEnd)(element, node); return yes; } return no; } /** * Insert "node" into markup tree in place of "element" * which is moved to become the child of the node */ static void InsertNodeAsParent(Node *element, Node *node) { node->content = element; node->last = element; node->parent = element->parent; element->parent = node; if (node->parent->content == element) node->parent->content = node; if (node->parent->last == element) node->parent->last = node; node->prev = element->prev; element->prev = NULL; if (node->prev) node->prev->next = node; node->next = element->next; element->next = NULL; if (node->next) node->next->prev = node; } /** * Unexpected content in table row is moved to just before the table in * in accordance with Netscape and IE. This code assumes that node hasn't * been inserted into the row. 
*/ static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row, Node *node ) { Node *table; /* first find the table element */ for (table = row->parent; table; table = table->parent) { if ( nodeIsTABLE(table) ) { TY_(InsertNodeBeforeElement)( table, node ); return; } } /* No table element */ TY_(InsertNodeBeforeElement)( row->parent, node ); } /** * Moves given node to end of body element. */ static void MoveNodeToBody( TidyDocImpl* doc, Node* node ) { Node* body = TY_(FindBody)( doc ); if ( body ) { TY_(RemoveNode)( node ); TY_(InsertNodeAtEnd)( body, node ); } } /** * Move node to the head, where element is used as starting * point in hunt for head. Normally called during parsing. */ static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node ) { Node *head = NULL; TY_(RemoveNode)( node ); /* make sure that node is isolated */ if ( TY_(nodeIsElement)(node) ) { TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN ); head = TY_(FindHEAD)(doc); assert(head != NULL); TY_(InsertNodeAtEnd)(head, node); if ( node->tag->parser ) { /* Only one of the existing test cases as of 2021-08-14 invoke MoveToHead, and it doesn't go deeper than one level. The parser() call is supposed to return a node if additional parsing is needed. Keep this in mind if we start to get bug reports. */ Parser* parser = node->tag->parser; parser( doc, node, IgnoreWhitespace ); } } else { TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node ); } } /***************************************************************************//* ** MARK: - Decision Making ***************************************************************************/ /** * Indicates whether or not element can be pruned based on content, * user settings, etc. 
*/
static Bool CanPrune( TidyDocImpl* doc, Node *element )
{
    Bool has_attributes;

    /* Nothing is ever pruned unless the user enabled it. */
    if ( !cfgBool(doc, TidyDropEmptyElems) )
        return no;

    /* Text nodes are always candidates. */
    if ( TY_(nodeIsText)(element) )
        return yes;

    /* Elements with children, or without a tag definition, are kept. */
    if ( element->content || element->tag == NULL )
        return no;

    has_attributes = (element->attributes != NULL) ? yes : no;

    /* Block elements and anchors are kept once they carry attributes. */
    if ( has_attributes
         && ( (element->tag->model & CM_BLOCK) || nodeIsA(element) ) )
        return no;

    /* Empty paragraphs are governed by their own option. */
    if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) )
        return no;

    /* Table rows and elements declared empty are kept. */
    if ( element->tag->model & (CM_ROW | CM_EMPTY) )
        return no;

    if ( nodeIsAPPLET(element) || nodeIsOBJECT(element) )
        return no;

    if ( nodeIsSCRIPT(element) && attrGetSRC(element) )
        return no;

    if ( nodeIsTITLE(element) )
        return no;

    /* #433359 - fix by Randy Waki 12 Mar 01 */
    if ( nodeIsIFRAME(element) )
        return no;

    /* fix for bug 770297 */
    if ( nodeIsTEXTAREA(element) )
        return no;

    /* fix for ISSUE #7 https://github.com/w3c/tidy-html5/issues/7 */
    if ( nodeIsCANVAS(element) || nodeIsPROGRESS(element) )
        return no;

    if ( attrGetID(element) || attrGetNAME(element) )
        return no;

    /* fix for bug 695408; a better fix would look for unknown and    */
    /* known proprietary attributes that make the element significant */
    if ( attrGetDATAFLD(element) )
        return no;

    /* fix for bug 723772, don't trim new-...-tags */
    if ( element->tag->id == TidyTag_UNKNOWN )
        return no;

    if ( nodeIsBODY(element) || nodeIsCOLGROUP(element) )
        return no;

    /* HTML5 - do NOT drop empty option if it has attributes */
    if ( nodeIsOPTION(element) && has_attributes )
        return no;

    /* fix for #103 - don't drop empty dd tags lest document not validate */
    if ( nodeIsDD(element) )
        return no;

    return yes;
}


/**
 *  Indicates whether or not node is a descendant of a tag of the given tid.
*/ static Bool DescendantOf( Node *element, TidyTagId tid ) { Node *parent; for ( parent = element->parent; parent != NULL; parent = parent->parent ) { if ( TagIsId(parent, tid) ) return yes; } return no; } /** * Indicates whether or not node is a descendant of a pre tag. */ static Bool IsPreDescendant(Node* node) { Node *parent = node->parent; while (parent) { if (parent->tag && parent->tag->parser == TY_(ParsePre)) return yes; parent = parent->parent; } return no; } /** * Indicates whether or not the only content model for the given node * is CM_INLINE. */ static Bool nodeCMIsOnlyInline( Node* node ) { return TY_(nodeHasCM)( node, CM_INLINE ) && !TY_(nodeHasCM)( node, CM_BLOCK ); } /** * Indicates whether or not the content of the given node is acceptable * content for pre elements */ static Bool PreContent( TidyDocImpl* ARG_UNUSED(doc), Node* node ) { /* p is coerced to br's, Text OK too */ if ( nodeIsP(node) || TY_(nodeIsText)(node) ) return yes; if ( node->tag == NULL || nodeIsPARAM(node) || !TY_(nodeHasCM)(node, CM_INLINE|CM_NEW) ) return no; return yes; } /** * Indicates whether or not leading whitespace should be cleaned. */ static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node) { if (!TY_(nodeIsText)(node)) return no; if (node->parent->type == DocTypeTag) return no; if (IsPreDescendant(node)) return no; if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript)) return no; /* #523, prevent blank spaces after script if the next item is script. * This is actually more generalized as, if the preceding element is * a body level script, then indicate that we want to clean leading * whitespace. */ if ( node->prev && nodeIsSCRIPT(node->prev) && nodeIsBODY(node->prev->parent) ) return yes; /*

...
......

*/ if (nodeIsBR(node->prev)) return yes; /*

...

*/ if (node->prev == NULL && !TY_(nodeHasCM)(node->parent, CM_INLINE)) return yes; /*

...

... */ if (node->prev && !TY_(nodeHasCM)(node->prev, CM_INLINE) && TY_(nodeIsElement)(node->prev)) return yes; /*

...

*/ if (!node->prev && !node->parent->prev && !TY_(nodeHasCM)(node->parent->parent, CM_INLINE)) return yes; return no; } /** * Indicates whether or not trailing whitespace should be cleaned. */ static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node) { Node* next; if (!TY_(nodeIsText)(node)) return no; if (node->parent->type == DocTypeTag) return no; if (IsPreDescendant(node)) return no; if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript)) return no; /* #523, prevent blank spaces after script if the next item is script. * This is actually more generalized as, if the next element is * a body level script, then indicate that we want to clean trailing * whitespace. */ if ( node->next && nodeIsSCRIPT(node->next) && nodeIsBODY(node->next->parent) ) return yes; next = node->next; /*

...

*/ if (!next && !TY_(nodeHasCM)(node->parent, CM_INLINE)) return yes; /*
...

...

*/ if (!next && node->parent->next && !TY_(nodeHasCM)(node->parent->next, CM_INLINE)) return yes; if (!next) return no; if (nodeIsBR(next)) return yes; if (TY_(nodeHasCM)(next, CM_INLINE)) return no; /* ...

...

*/ if (next->type == StartTag) return yes; /* ...
*/ if (next->type == StartEndTag) return yes; /* evil adjacent text nodes, Tidy should not generate these :-( */ if (TY_(nodeIsText)(next) && next->start < next->end && TY_(IsWhite)(doc->lexer->lexbuf[next->start])) return yes; return no; } /***************************************************************************//* ** MARK: - Information Accumulation ***************************************************************************/ /** * Errors in positioning of form start or end tags * generally require human intervention to fix. * Issue #166 - repeated
element also uses this flag * to indicate duplicates, discarded. */ static void BadForm( TidyDocImpl* doc ) { doc->badForm |= flg_BadForm; } /***************************************************************************//* ** MARK: - Fixes and Touchup ***************************************************************************/ /** * Adds style information as a class in the document or a property * of the node to prevent indentation of inferred UL tags. */ static void AddClassNoIndent( TidyDocImpl* doc, Node *node ) { ctmbstr sprop = "padding-left: 2ex; margin-left: 0ex" "; margin-top: 0ex; margin-bottom: 0ex"; if ( !cfgBool(doc, TidyDecorateInferredUL) ) return; if ( cfgBool(doc, TidyMakeClean) ) TY_(AddStyleAsClass)( doc, node, sprop ); else TY_(AddStyleProperty)( doc, node, sprop ); } /** * Cleans whitespace from text nodes, and drops such nodes if emptied * completely as a result. */ static void CleanSpaces(TidyDocImpl* doc, Node* node) { Stack *stack = TY_(newStack)(doc, 16); Node *next; while (node) { next = node->next; if (TY_(nodeIsText)(node) && CleanLeadingWhitespace(doc, node)) while (node->start < node->end && TY_(IsWhite)(doc->lexer->lexbuf[node->start])) ++(node->start); if (TY_(nodeIsText)(node) && CleanTrailingWhitespace(doc, node)) while (node->end > node->start && TY_(IsWhite)(doc->lexer->lexbuf[node->end - 1])) --(node->end); if (TY_(nodeIsText)(node) && !(node->start < node->end)) { TY_(RemoveNode)(node); TY_(FreeNode)(doc, node); node = next ? next : TY_(pop)(stack); continue; } if (node->content) { TY_(push)(stack, next); node = node->content; continue; } node = next ? next : TY_(pop)(stack); } TY_(freeStack)(stack); } /** * If a table row is empty then insert an empty cell. This practice is * consistent with browser behavior and avoids potential problems with * row spanning cells. 
*/
static void FixEmptyRow(TidyDocImpl* doc, Node *row)
{
    Node *cell;

    if ( row->content != NULL )
        return;

    cell = TY_(InferredTag)(doc, TidyTag_TD);
    TY_(InsertNodeAtEnd)(row, cell);
    TY_(Report)(doc, row, cell, MISSING_STARTTAG);
}


/**
 *  The doctype has been found after other tags,
 *  and needs moving to before the html element
 *
 *  If the document already has a doctype, the new one is reported and
 *  freed instead.
 */
static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype )
{
    Node *anchor;

    if ( TY_(FindDocType)( doc ) != NULL )
    {
        TY_(Report)(doc, element, doctype, DISCARDING_UNEXPECTED );
        TY_(FreeNode)( doc, doctype );
        return;
    }

    TY_(Report)(doc, element, doctype, DOCTYPE_AFTER_TAGS );

    /* climb to the enclosing html element */
    anchor = element;
    while ( !nodeIsHTML(anchor) )
        anchor = anchor->parent;

    TY_(InsertNodeBeforeElement)( anchor, doctype );
}


/**
 *  This maps
 *

hello world * to *

hello world * * Trims initial space, by moving it before the * start tag, or if this element is the first in * parent's content, then by discarding the space */ static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text ) { Lexer* lexer = doc->lexer; Node *prev, *node; if ( TY_(nodeIsText)(text) && lexer->lexbuf[text->start] == ' ' && text->start < text->end ) { if ( (element->tag->model & CM_INLINE) && !(element->tag->model & CM_FIELD) ) { prev = element->prev; if (TY_(nodeIsText)(prev)) { if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ') lexer->lexbuf[(prev->end)++] = ' '; ++(element->start); } else /* create new node */ { node = TY_(NewNode)(lexer->allocator, lexer); node->start = (element->start)++; node->end = element->start; lexer->lexbuf[node->start] = ' '; TY_(InsertNodeBeforeElement)(element ,node); DEBUG_LOG(SPRTF("TrimInitialSpace: Created text node, inserted before <%s>\n", (element->element ? element->element : "unknown"))); } } /* discard the space in current node */ ++(text->start); } } /** * This maps * hello world * to * hello world * * If last child of element is a text node * then trim trailing white space character * moving it to after element's end tag. */ static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last ) { Lexer* lexer = doc->lexer; byte c; if (TY_(nodeIsText)(last)) { if (last->end > last->start) { c = (byte) lexer->lexbuf[ last->end - 1 ]; if ( c == ' ' ) { last->end -= 1; if ( (element->tag->model & CM_INLINE) && !(element->tag->model & CM_FIELD) ) lexer->insertspace = yes; } } } } /** * Move initial and trailing space out. 
* This routine maps: * hello world * to * hello world * and * hello world * to * hello world */ static void TrimSpaces( TidyDocImpl* doc, Node *element) { Node* text = element->content; if (nodeIsPRE(element) || IsPreDescendant(element)) return; if (TY_(nodeIsText)(text)) TrimInitialSpace(doc, element, text); text = element->last; if (TY_(nodeIsText)(text)) TrimTrailingSpace(doc, element, text); } /***************************************************************************//* ** MARK: - Parsers Support ***************************************************************************/ /** * Structure used by FindDescendant_cb. */ struct MatchingDescendantData { Node *found_node; Bool *passed_marker_node; /* input: */ TidyTagId matching_tagId; Node *node_to_find; Node *marker_node; }; /** * The main engine for FindMatchingDescendant. */ static NodeTraversalSignal FindDescendant_cb(TidyDocImpl* ARG_UNUSED(doc), Node* node, void *propagate) { struct MatchingDescendantData *cb_data = (struct MatchingDescendantData *)propagate; if (TagId(node) == cb_data->matching_tagId) { /* make sure we match up 'unknown' tags exactly! */ if (cb_data->matching_tagId != TidyTag_UNKNOWN || (node->element != NULL && cb_data->node_to_find != NULL && cb_data->node_to_find->element != NULL && 0 == TY_(tmbstrcmp)(cb_data->node_to_find->element, node->element))) { cb_data->found_node = node; return ExitTraversal; } } if (cb_data->passed_marker_node && node == cb_data->marker_node) *cb_data->passed_marker_node = yes; return VisitParent; } /** * Search the parent chain (from `parent` upwards up to the root) for a node * matching the given 'node'. * * When the search passes beyond the `marker_node` (which is assumed to sit * in the parent chain), this will be flagged by setting the boolean * referenced by `is_parent_of_marker` to `yes`. * * 'is_parent_of_marker' and 'marker_node' are optional parameters and may * be NULL. 
*/ static Node *FindMatchingDescendant( Node *parent, Node *node, Node *marker_node, Bool *is_parent_of_marker ) { struct MatchingDescendantData cb_data = { 0 }; cb_data.matching_tagId = TagId(node); cb_data.node_to_find = node; cb_data.marker_node = marker_node; assert(node); if (is_parent_of_marker) *is_parent_of_marker = no; TY_(TraverseNodeTree)(NULL, parent, FindDescendant_cb, &cb_data); return cb_data.found_node; } /** * Finds the last list item for the given list, providing it in the * in-out parameter. Returns yes or no if the item was the last list * item. */ static Bool FindLastLI( Node *list, Node **lastli ) { Node *node; *lastli = NULL; for ( node = list->content; node ; node = node->next ) if ( nodeIsLI(node) && node->type == StartTag ) *lastli=node; return *lastli ? yes:no; } /***************************************************************************//* ** MARK: - Parser Stack ***************************************************************************/ /** * Allocates and initializes the parser's stack. */ void TY_(InitParserStack)( TidyDocImpl* doc ) { enum { default_size = 32 }; TidyParserMemory *content = (TidyParserMemory *) TidyAlloc( doc->allocator, sizeof(TidyParserMemory) * default_size ); doc->stack.content = content; doc->stack.size = default_size; doc->stack.top = -1; } /** * Frees the parser's stack when done. */ void TY_(FreeParserStack)( TidyDocImpl* doc ) { TidyFree( doc->allocator, doc->stack.content ); doc->stack.content = NULL; doc->stack.size = 0; doc->stack.top = -1; } /** * Increase the stack size. 
*/ static void growParserStack( TidyDocImpl* doc ) { TidyParserMemory *content; content = (TidyParserMemory *) TidyAlloc( doc->allocator, sizeof(TidyParserMemory) * doc->stack.size * 2 ); memcpy( content, doc->stack.content, sizeof(TidyParserMemory) * (doc->stack.top + 1) ); TidyFree(doc->allocator, doc->stack.content); doc->stack.content = content; doc->stack.size = doc->stack.size * 2; } /** * Indicates whether or not the stack is empty. */ Bool TY_(isEmptyParserStack)( TidyDocImpl* doc ) { return doc->stack.top < 0; } /** * Peek at the parser memory. */ TidyParserMemory TY_(peekMemory)( TidyDocImpl* doc ) { return doc->stack.content[doc->stack.top]; } /** * Peek at the parser memory "identity" field. This is just a convenience * to avoid having to create a new struct instance in the caller. */ Parser* TY_(peekMemoryIdentity)( TidyDocImpl* doc ) { return doc->stack.content[doc->stack.top].identity; } /** * Peek at the parser memory "mode" field. This is just a convenience * to avoid having to create a new struct instance in the caller. */ GetTokenMode TY_(peekMemoryMode)( TidyDocImpl* doc ) { return doc->stack.content[doc->stack.top].mode; } /** * Pop out a parser memory. */ TidyParserMemory TY_(popMemory)( TidyDocImpl* doc ) { if ( !TY_(isEmptyParserStack)( doc ) ) { TidyParserMemory data = doc->stack.content[doc->stack.top]; DEBUG_LOG(SPRTF("\n" "<--POP original: %s @ %p\n" " reentry: %s @ %p\n" " stack depth: %lu @ %p\n" " mode: %u\n" " register 1: %i\n" " register 2: %i\n\n", data.original_node ? data.original_node->element : "none", data.original_node, data.reentry_node ? data.reentry_node->element : "none", data.reentry_node, doc->stack.top, &doc->stack.content[doc->stack.top], data.mode, data.register_1, data.register_2 )); doc->stack.top = doc->stack.top - 1; return data; } TidyParserMemory blank = { NULL }; return blank; } /** * Push the parser memory to the stack. 
*/ void TY_(pushMemory)( TidyDocImpl* doc, TidyParserMemory data ) { if ( doc->stack.top == doc->stack.size - 1 ) growParserStack( doc ); doc->stack.top++; doc->stack.content[doc->stack.top] = data; DEBUG_LOG(SPRTF("\n" "-->PUSH original: %s @ %p\n" " reentry: %s @ %p\n" " stack depth: %lu @ %p\n" " mode: %u\n" " register 1: %i\n" " register 2: %i\n\n", data.original_node ? data.original_node->element : "none", data.original_node, data.reentry_node ? data.reentry_node->element : "none", data.reentry_node, doc->stack.top, &doc->stack.content[doc->stack.top], data.mode, data.register_1, data.register_2 )); } /***************************************************************************//* ** MARK: Convenience Logging Macros ***************************************************************************/ #if defined(ENABLE_DEBUG_LOG) # define DEBUG_LOG_COUNTERS \ static int depth_parser = 0;\ static int count_parser = 0;\ int old_mode = IgnoreWhitespace; # define DEBUG_LOG_GET_OLD_MODE old_mode = mode; # define DEBUG_LOG_REENTER_WITH_NODE(NODE) SPRTF("\n>>>Re-Enter %s-%u with '%s', +++mode: %u, depth: %d, cnt: %d\n", __FUNCTION__, __LINE__, NODE->element, mode, ++depth_parser, ++count_parser); # define DEBUG_LOG_ENTER_WITH_NODE(NODE) SPRTF("\n>>>Enter %s-%u with '%s', +++mode: %u, depth: %d, cnt: %d\n", __FUNCTION__, __LINE__, NODE->element, mode, ++depth_parser, ++count_parser); # define DEBUG_LOG_CHANGE_MODE SPRTF("+++%s-%u Changing mode to %u (was %u)\n", __FUNCTION__, __LINE__, mode, old_mode); # define DEBUG_LOG_GOT_TOKEN(NODE) SPRTF("---%s-%u got token '%s' with mode '%u'.\n", __FUNCTION__, __LINE__, NODE ? NODE->element : NULL, mode); # define DEBUG_LOG_EXIT_WITH_NODE(NODE) SPRTF("<<element, depth_parser--); # define DEBUG_LOG_EXIT SPRTF("<<lexer; if ( cfgBool( doc, TidyXmlTags ) ) return ParseXMLElement; /* [i_a]2 prevent crash for active content (php, asp) docs */ if (!node || node->tag == NULL) return NULL; /* Fix by GLP 2000-12-21. 
Need to reset insertspace if this is both a non-inline and empty tag (base, link, meta, isindex, hr, area). */ if (node->tag->model & CM_EMPTY) { lexer->waswhite = no; if (node->tag->parser == NULL) return NULL; } else if (!(node->tag->model & CM_INLINE)) lexer->insertspace = no; if (node->tag->parser == NULL) return NULL; if (node->type == StartEndTag) return NULL; /* [i_a]2 added this - not sure why - CHECKME: */ lexer->parent = node; return (node->tag->parser); } /** * This parser controller initiates the parsing process with the document's * root starting with the provided node, which should be the HTML node after * the pre-HTML stuff is handled at a higher level. * * This controller is responsible for calling each of the individual parsers, * based on the tokens it pulls from the lexer, or the tokens passed back via * the parserMemory stack from each of the parsers. Having a main, central * looping dispatcher in this fashion allows the prevention of recursion. */ void ParseHTMLWithNode( TidyDocImpl* doc, Node* node ) { GetTokenMode mode = IgnoreWhitespace; Parser* parser = GetParserForNode( doc, node ); Bool something_to_do = yes; /* This main loop is only extinguished when all of the parser tokens are consumed. Ideally, EVERY parser will return nodes to this loop for dispatch to the appropriate parser, but some of the recursive parsers still consume some tokens on their own. */ while (something_to_do) { node = parser ? parser( doc, node, mode ) : NULL; /* We have a node, so anything deferred was already pushed to the stack to be dealt with later. */ if ( node ) { parser = GetParserForNode( doc, node ); continue; } /* We weren't given a node, which means this particular leaf is bottomed out. We'll re-enter the parsers using information from the stack. */ if ( !TY_(isEmptyParserStack)(doc)) { parser = TY_(peekMemoryIdentity)(doc); if (parser) { continue; } else { /* No parser means we're only passing back a parsing mode. 
*/ mode = TY_(peekMemoryMode)( doc ); TY_(popMemory)( doc ); } } /* At this point, there's nothing being returned from parsers, and nothing on the stack, so we can draw a new node from the lexer. */ node = TY_(GetToken)( doc, mode ); DEBUG_LOG_GOT_TOKEN(node); if (node) parser = GetParserForNode( doc, node ); else something_to_do = no; } } /***************************************************************************//* ** MARK: - Parsers ***************************************************************************/ /** MARK: TY_(ParseBlock) * `element` is a node created by the lexer upon seeing the start tag, or * by the parser when the start tag is inferred * * This is a non-recursing parser. It uses the document's parser memory stack * to send subsequent nodes back to the controller for dispatching to parsers. * This parser is also re-enterable, so that post-processing can occur after * such dispatching. */ Node* TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode ) { Lexer* lexer = doc->lexer; Node *node = NULL; Bool checkstack = yes; uint istackbase = 0; DEBUG_LOG_COUNTERS; if ( element == NULL ) { TidyParserMemory memory = TY_(popMemory)( doc ); node = memory.reentry_node; /* Throwaway, because the loop overwrites this immediately. */ DEBUG_LOG_REENTER_WITH_NODE(node); element = memory.original_node; DEBUG_LOG_GET_OLD_MODE; mode = memory.reentry_mode; DEBUG_LOG_CHANGE_MODE; } else { DEBUG_LOG_ENTER_WITH_NODE(element); if ( element->tag->model & CM_EMPTY ) { DEBUG_LOG_EXIT; return NULL; } if ( nodeIsDIV(element) && nodeIsDL(element->parent) && TY_(IsHTML5Mode)(doc) ) { DEBUG_LOG_EXIT; return TY_(ParseDefList)(doc, element, mode); /* @warning: possible recursion! 
*/ } if ( nodeIsFORM(element) && DescendantOf(element, TidyTag_FORM) ) { TY_(Report)(doc, element, NULL, ILLEGAL_NESTING ); } /* InlineDup() asks the lexer to insert inline emphasis tags currently pushed on the istack, but take care to avoid propagating inline emphasis inside OBJECT or APPLET. For these elements a fresh inline stack context is created and disposed of upon reaching the end of the element. They thus behave like table cells in this respect. */ if (element->tag->model & CM_OBJECT) { istackbase = lexer->istackbase; lexer->istackbase = lexer->istacksize; } if (!(element->tag->model & CM_MIXED)) { TY_(InlineDup)( doc, NULL ); } /*\ * Issue #212 - If it is likely that it may be necessary * to move a leading space into a text node before this * element, then keep the mode MixedContent to keep any * leading space \*/ if ( !(element->tag->model & CM_INLINE) || (element->tag->model & CM_FIELD ) ) { DEBUG_LOG_GET_OLD_MODE; mode = IgnoreWhitespace; DEBUG_LOG_CHANGE_MODE; } else if (mode == IgnoreWhitespace) { /* Issue #212 - Further fix in case ParseBlock() is called with 'IgnoreWhitespace' when such a leading space may need to be inserted before this element to preserve the browser view */ DEBUG_LOG_GET_OLD_MODE; mode = MixedContent; DEBUG_LOG_CHANGE_MODE; } } /* Re-Entering */ /* Main Loop */ while ((node = TY_(GetToken)(doc, mode /*MixedContent*/)) != NULL) { DEBUG_LOG_GOT_TOKEN(node); /* end tag for this element */ if (node->type == EndTag && node->tag && (node->tag == element->tag || element->was == node->tag)) { TY_(FreeNode)( doc, node ); if (element->tag->model & CM_OBJECT) { /* pop inline stack */ while (lexer->istacksize > lexer->istackbase) TY_(PopInline)( doc, NULL ); lexer->istackbase = istackbase; } element->closed = yes; TrimSpaces( doc, element ); DEBUG_LOG_EXIT; return NULL; } if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) ) { if ( TY_(nodeIsElement)(node) ) TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( 
doc, node ); continue; } if (node->type == EndTag) { if (node->tag == NULL) { TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, node ); continue; } else if ( nodeIsBR(node) ) { node->type = StartTag; } else if ( nodeIsP(node) ) { /* Cannot have a block inside a paragraph, so no checking for an ancestor is necessary -- but we _can_ have paragraphs inside a block, so change it to an implicit empty paragraph, to be dealt with according to the user's options */ node->type = StartEndTag; node->implicit = yes; } else if (DescendantOf( element, node->tag->id )) { /* if this is the end tag for an ancestor element then infer end tag for this element */ TY_(UngetToken)( doc ); break; } else { /* special case etc. for stuff moved in front of table */ if ( lexer->exiled && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) ) { TY_(UngetToken)( doc ); TrimSpaces( doc, element ); DEBUG_LOG_EXIT; return NULL; } } } /* mixed content model permits text */ if (TY_(nodeIsText)(node)) { if ( checkstack ) { checkstack = no; if (!(element->tag->model & CM_MIXED)) { if ( TY_(InlineDup)(doc, node) > 0 ) continue; } } TY_(InsertNodeAtEnd)(element, node); DEBUG_LOG_GET_OLD_MODE mode = MixedContent; DEBUG_LOG_CHANGE_MODE; /* HTML4 strict doesn't allow mixed content for elements with %block; as their content model */ /* But only body, map, blockquote, form and noscript have content model %block; */ if ( nodeIsBODY(element) || nodeIsMAP(element) || nodeIsBLOCKQUOTE(element) || nodeIsFORM(element) || nodeIsNOSCRIPT(element) ) TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT ); continue; } if ( InsertMisc(element, node) ) continue; /* allow PARAM elements? */ if ( nodeIsPARAM(node) ) { if ( TY_(nodeHasCM)(element, CM_PARAM) && TY_(nodeIsElement)(node) ) { TY_(InsertNodeAtEnd)(element, node); continue; } /* otherwise discard it */ TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, node ); continue; } /* allow AREA elements? 
*/ if ( nodeIsAREA(node) ) { if ( nodeIsMAP(element) && TY_(nodeIsElement)(node) ) { TY_(InsertNodeAtEnd)(element, node); continue; } /* otherwise discard it */ TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, node ); continue; } /* ignore unknown start/end tags */ if ( node->tag == NULL ) { TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, node ); continue; } /* Allow CM_INLINE elements here. Allow CM_BLOCK elements here unless lexer->excludeBlocks is yes. LI and DD are special cased. Otherwise infer end tag for this element. */ if ( !TY_(nodeHasCM)(node, CM_INLINE) ) { if ( !TY_(nodeIsElement)(node) ) { if ( nodeIsFORM(node) ) BadForm( doc ); TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, node ); continue; } /* #427671 - Fix by Randy Waki - 10 Aug 00 */ /* If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION start tag, discard the start tag and let the subsequent content get parsed as content of the enclosing LI. This seems to mimic IE and Netscape, and avoids an infinite loop: without this check, ParseBlock (which is parsing the LI's content) and ParseList (which is parsing the LI's parent's content) repeatedly defer to each other to parse the illegal start tag, each time inferring a missing or

  • respectively. NOTE: This check is a bit fragile. It specifically checks for the four tags that happen to weave their way through the current series of tests performed by ParseBlock and ParseList to trigger the infinite loop. */ if ( nodeIsLI(element) ) { if ( nodeIsFRAME(node) || nodeIsFRAMESET(node) || nodeIsOPTGROUP(node) || nodeIsOPTION(node) ) { TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, node ); /* DSR - 27Apr02 avoid memory leak */ continue; } } if ( nodeIsTD(element) || nodeIsTH(element) ) { /* if parent is a table cell, avoid inferring the end of the cell */ if ( TY_(nodeHasCM)(node, CM_HEAD) ) { MoveToHead( doc, element, node ); continue; } if ( TY_(nodeHasCM)(node, CM_LIST) ) { TY_(UngetToken)( doc ); node = TY_(InferredTag)(doc, TidyTag_UL); AddClassNoIndent(doc, node); lexer->excludeBlocks = yes; } else if ( TY_(nodeHasCM)(node, CM_DEFLIST) ) { TY_(UngetToken)( doc ); node = TY_(InferredTag)(doc, TidyTag_DL); lexer->excludeBlocks = yes; } /* infer end of current table cell */ if ( !TY_(nodeHasCM)(node, CM_BLOCK) ) { TY_(UngetToken)( doc ); TrimSpaces( doc, element ); DEBUG_LOG_EXIT; return NULL; } } else if ( TY_(nodeHasCM)(node, CM_BLOCK) ) { if ( lexer->excludeBlocks ) { if ( !TY_(nodeHasCM)(element, CM_OPT) ) TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE ); TY_(UngetToken)( doc ); if ( TY_(nodeHasCM)(element, CM_OBJECT) ) lexer->istackbase = istackbase; TrimSpaces( doc, element ); DEBUG_LOG_EXIT; return NULL; } } else if ( ! 
nodeIsTEMPLATE( element ) )/* things like list items */ { if (node->tag->model & CM_HEAD) { MoveToHead( doc, element, node ); continue; } /* special case where a form start tag occurs in a tr and is followed by td or th */ if ( nodeIsFORM(element) && nodeIsTD(element->parent) && element->parent->implicit ) { if ( nodeIsTD(node) ) { TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, node ); continue; } if ( nodeIsTH(node) ) { TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, node ); node = element->parent; TidyDocFree(doc, node->element); node->element = TY_(tmbstrdup)(doc->allocator, "th"); node->tag = TY_(LookupTagDef)( TidyTag_TH ); continue; } } if ( !TY_(nodeHasCM)(element, CM_OPT) && !element->implicit ) TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE ); /* #521, warn on missing optional end-tags if not omitting them. */ if ( cfgBool( doc, TidyOmitOptionalTags ) == no && TY_(nodeHasCM)(element, CM_OPT) ) TY_(Report)(doc, element, node, MISSING_ENDTAG_OPTIONAL ); TY_(UngetToken)( doc ); if ( TY_(nodeHasCM)(node, CM_LIST) ) { if ( element->parent && element->parent->tag && element->parent->tag->parser == TY_(ParseList) ) { TrimSpaces( doc, element ); DEBUG_LOG_EXIT; return NULL; } node = TY_(InferredTag)(doc, TidyTag_UL); AddClassNoIndent(doc, node); } else if ( TY_(nodeHasCM)(node, CM_DEFLIST) ) { if ( nodeIsDL(element->parent) ) { TrimSpaces( doc, element ); DEBUG_LOG_EXIT; return NULL; } node = TY_(InferredTag)(doc, TidyTag_DL); } else if ( TY_(nodeHasCM)(node, CM_TABLE) || TY_(nodeHasCM)(node, CM_ROW) ) { /* http://tidy.sf.net/issue/1316307 */ /* In exiled mode, return so table processing can continue. 
*/ if (lexer->exiled) { DEBUG_LOG_EXIT; return NULL; } node = TY_(InferredTag)(doc, TidyTag_TABLE); } else if ( TY_(nodeHasCM)(element, CM_OBJECT) ) { /* pop inline stack */ while ( lexer->istacksize > lexer->istackbase ) TY_(PopInline)( doc, NULL ); lexer->istackbase = istackbase; TrimSpaces( doc, element ); DEBUG_LOG_EXIT; return NULL; } else { TrimSpaces( doc, element ); DEBUG_LOG_EXIT; return NULL; } } } /*\ * Issue #307 - an tag to ends any open element * Like #427827 - fixed by Randy Waki and Bjoern Hoehrmann 23 Aug 00 * in ParseInline(), fix copied HERE to ParseBlock() * href: http://www.w3.org/TR/html-markup/a.html * The interactive element a must not appear as a descendant of the a element. \*/ if ( nodeIsA(node) && !node->implicit && (nodeIsA(element) || DescendantOf(element, TidyTag_A)) ) { if (node->type != EndTag && node->attributes == NULL && cfgBool(doc, TidyCoerceEndTags) ) { node->type = EndTag; TY_(Report)(doc, element, node, COERCE_TO_ENDTAG); TY_(UngetToken)( doc ); continue; } if (nodeIsA(element)) { TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE); TY_(UngetToken)( doc ); } else { /* Issue #597 - if we not 'UngetToken' then it is being discarded. Add message, and 'FreeNode' - thanks @ralfjunker */ TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); TY_(FreeNode)(doc, node); } if (!(mode & Preformatted)) TrimSpaces(doc, element); DEBUG_LOG_EXIT; return NULL; } /* parse known element */ if (TY_(nodeIsElement)(node)) { if (node->tag->model & CM_INLINE) { if (checkstack && !node->implicit) { checkstack = no; if (!(element->tag->model & CM_MIXED)) /* #431731 - fix by Randy Waki 25 Dec 00 */ { if ( TY_(InlineDup)(doc, node) > 0 ) continue; } } DEBUG_LOG_GET_OLD_MODE; mode = MixedContent; DEBUG_LOG_CHANGE_MODE; } else { checkstack = yes; DEBUG_LOG_GET_OLD_MODE; mode = IgnoreWhitespace; DEBUG_LOG_CHANGE_MODE; } /* trim white space before
    */ if ( nodeIsBR(node) ) TrimSpaces( doc, element ); TY_(InsertNodeAtEnd)(element, node); if (node->implicit) TY_(Report)(doc, element, node, INSERTING_TAG ); /* Issue #212 - WHY is this hard coded to 'IgnoreWhitespace' while an effort has been made above to set a 'MixedContent' mode in some cases? WHY IS THE 'mode' VARIABLE NOT USED HERE???? */ { TidyParserMemory memory = {0}; memory.identity = TY_(ParseBlock); memory.reentry_node = node; memory.reentry_mode = mode; memory.original_node = element; TY_(pushMemory)(doc, memory); DEBUG_LOG_EXIT_WITH_NODE(node); } return node; } /* discard unexpected tags */ if (node->type == EndTag) TY_(PopInline)( doc, node ); /* if inline end tag */ TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, node ); continue; } if (!(element->tag->model & CM_OPT)) TY_(Report)(doc, element, node, MISSING_ENDTAG_FOR); if (element->tag->model & CM_OBJECT) { /* pop inline stack */ while ( lexer->istacksize > lexer->istackbase ) TY_(PopInline)( doc, NULL ); lexer->istackbase = istackbase; } TrimSpaces( doc, element ); DEBUG_LOG_EXIT; return NULL; } /** MARK: TY_(ParseBody) * Parses the `body` tag. * * This is a non-recursing parser. It uses the document's parser memory stack * to send subsequent nodes back to the controller for dispatching to parsers. * This parser is also re-enterable, so that post-processing can occur after * such dispatching. */ Node* TY_(ParseBody)( TidyDocImpl* doc, Node *body, GetTokenMode mode ) { Lexer* lexer = doc->lexer; Node *node = NULL; Bool checkstack = no; Bool iswhitenode = no; DEBUG_LOG_COUNTERS; mode = IgnoreWhitespace; checkstack = yes; /* If we're re-entering, then we need to setup from a previous state, instead of starting fresh. We can pull what we need from the document's stack. */ if ( body == NULL ) { TidyParserMemory memory = TY_(popMemory)( doc ); node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. 
*/ DEBUG_LOG_REENTER_WITH_NODE(node); body = memory.original_node; checkstack = memory.register_1; iswhitenode = memory.register_2; DEBUG_LOG_GET_OLD_MODE; mode = memory.mode; DEBUG_LOG_CHANGE_MODE; } else { DEBUG_LOG_ENTER_WITH_NODE(body); TY_(BumpObject)( doc, body->parent ); } while ((node = TY_(GetToken)(doc, mode)) != NULL) { DEBUG_LOG_GOT_TOKEN(node); /* find and discard multiple elements */ if (node->tag == body->tag && node->type == StartTag) { TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED); TY_(FreeNode)(doc, node); continue; } /* #538536 Extra endtags not detected */ if ( nodeIsHTML(node) ) { if (TY_(nodeIsElement)(node) || lexer->seenEndHtml) TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED); else lexer->seenEndHtml = 1; TY_(FreeNode)( doc, node); continue; } if ( lexer->seenEndBody && ( node->type == StartTag || node->type == EndTag || node->type == StartEndTag ) ) { TY_(Report)(doc, body, node, CONTENT_AFTER_BODY ); } if ( node->tag == body->tag && node->type == EndTag ) { body->closed = yes; TrimSpaces(doc, body); TY_(FreeNode)( doc, node); lexer->seenEndBody = 1; DEBUG_LOG_GET_OLD_MODE; mode = IgnoreWhitespace; DEBUG_LOG_CHANGE_MODE; if ( nodeIsNOFRAMES(body->parent) ) break; continue; } if ( nodeIsNOFRAMES(node) ) { if (node->type == StartTag) { TidyParserMemory memory = {0}; TY_(InsertNodeAtEnd)(body, node); memory.identity = TY_(ParseBody); memory.original_node = body; memory.reentry_node = node; memory.register_1 = checkstack; memory.register_2 = iswhitenode; memory.mode = mode; TY_(pushMemory)( doc, memory ); DEBUG_LOG_EXIT_WITH_NODE(node); return node; } if (node->type == EndTag && nodeIsNOFRAMES(body->parent) ) { TrimSpaces(doc, body); TY_(UngetToken)( doc ); break; } } if ( (nodeIsFRAME(node) || nodeIsFRAMESET(node)) && nodeIsNOFRAMES(body->parent) ) { TrimSpaces(doc, body); TY_(UngetToken)( doc ); break; } iswhitenode = no; if ( TY_(nodeIsText)(node) && node->end <= node->start + 1 && lexer->lexbuf[node->start] == ' ' ) iswhitenode = 
yes; /* deal with comments etc. */ if (InsertMisc(body, node)) continue; /* mixed content model permits text */ if (TY_(nodeIsText)(node)) { if (iswhitenode && mode == IgnoreWhitespace) { TY_(FreeNode)( doc, node); continue; } /* HTML 2 and HTML4 strict don't allow text here */ TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT | VERS_HTML20)); if (checkstack) { checkstack = no; if ( TY_(InlineDup)(doc, node) > 0 ) continue; } TY_(InsertNodeAtEnd)(body, node); DEBUG_LOG_GET_OLD_MODE; mode = MixedContent; DEBUG_LOG_CHANGE_MODE; continue; } if (node->type == DocTypeTag) { InsertDocType(doc, body, node); continue; } /* discard unknown and PARAM tags */ if ( node->tag == NULL || nodeIsPARAM(node) ) { TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node); continue; } /* Netscape allows LI and DD directly in BODY We infer UL or DL respectively and use this Bool to exclude block-level elements so as to match Netscape's observed behaviour. */ lexer->excludeBlocks = no; if ((( nodeIsINPUT(node) || (!TY_(nodeHasCM)(node, CM_BLOCK) && !TY_(nodeHasCM)(node, CM_INLINE)) ) && !TY_(IsHTML5Mode)(doc)) || nodeIsLI(node) ) { /* avoid this error message being issued twice */ if (!(node->tag->model & CM_HEAD)) TY_(Report)(doc, body, node, TAG_NOT_ALLOWED_IN); if (node->tag->model & CM_HTML) { /* copy body attributes if current body was inferred */ if ( nodeIsBODY(node) && body->implicit && body->attributes == NULL ) { body->attributes = node->attributes; node->attributes = NULL; } TY_(FreeNode)( doc, node); continue; } if (node->tag->model & CM_HEAD) { MoveToHead(doc, body, node); continue; } if (node->tag->model & CM_LIST) { TY_(UngetToken)( doc ); node = TY_(InferredTag)(doc, TidyTag_UL); AddClassNoIndent(doc, node); lexer->excludeBlocks = yes; } else if (node->tag->model & CM_DEFLIST) { TY_(UngetToken)( doc ); node = TY_(InferredTag)(doc, TidyTag_DL); lexer->excludeBlocks = yes; } else if (node->tag->model & (CM_TABLE | CM_ROWGRP | CM_ROW)) { /* 
http://tidy.sf.net/issue/2855621 */ if (node->type != EndTag) { TY_(UngetToken)( doc ); node = TY_(InferredTag)(doc, TidyTag_TABLE); } lexer->excludeBlocks = yes; } else if ( nodeIsINPUT(node) ) { TY_(UngetToken)( doc ); node = TY_(InferredTag)(doc, TidyTag_FORM); lexer->excludeBlocks = yes; } else { if ( !TY_(nodeHasCM)(node, CM_ROW | CM_FIELD) ) { TY_(UngetToken)( doc ); DEBUG_LOG_EXIT; return NULL; } /* ignore