/* parser.c -- HTML Parser

  (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
  See tidy.h for the copyright notice.

*/

#include "tidy-int.h"
#include "lexer.h"
#include "parser.h"
#include "message.h"
#include "clean.h"
#include "tags.h"
#include "tmbstr.h"

#ifdef _MSC_VER
#include "sprtf.h"
#endif

#ifndef SPRTF
#define SPRTF printf
#endif

#ifdef AUTO_INPUT_ENCODING
#include "charsets.h"
#endif

/*
  Recursively checks that the links around "node" agree with each other:
  prev/next must point back at the node, a node without a prev (next)
  sibling must be its parent's first (last) child, and every child must
  name "node" as its parent and pass the same check itself.

  Returns no at the first inconsistency, yes otherwise.  When
  NO_NODE_INTEGRITY_CHECK is defined the body is compiled out and the
  function always returns yes.
*/
Bool TY_(CheckNodeIntegrity)(Node *node)
{
#ifndef NO_NODE_INTEGRITY_CHECK
    Node *kid;

    /* the previous sibling must link forward to this node */
    if (node->prev && node->prev->next != node)
        return no;

    /* the next sibling must not be the node itself and must link back */
    if (node->next && (node->next == node || node->next->prev != node))
        return no;

    /* a node at either end of the sibling list must be recorded as such */
    if (node->parent)
    {
        if (!node->prev && node->parent->content != node)
            return no;

        if (!node->next && node->parent->last != node)
            return no;
    }

    kid = node->content;
    while (kid)
    {
        if (kid->parent != node)
            return no;

        if (!TY_(CheckNodeIntegrity)(kid))
            return no;

        kid = kid->next;
    }
#endif
    return yes;
}

/*
  used to determine how attributes
  without values should be printed
  this was introduced to deal with
  user defined tags e.g.
Cold Fusion */ Bool TY_(IsNewNode)(Node *node) { if (node && node->tag) { return (node->tag->model & CM_NEW); } return yes; } void TY_(CoerceNode)(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected) { const Dict* tag = TY_(LookupTagDef)(tid); Node* tmp = TY_(InferredTag)(doc, tag->id); if (obsolete) TY_(ReportWarning)(doc, node, tmp, OBSOLETE_ELEMENT); else if (unexpected) TY_(ReportError)(doc, node, tmp, REPLACING_UNEX_ELEMENT); else TY_(ReportNotice)(doc, node, tmp, REPLACING_ELEMENT); TidyDocFree(doc, tmp->element); TidyDocFree(doc, tmp); node->was = node->tag; node->tag = tag; node->type = StartTag; node->implicit = yes; TidyDocFree(doc, node->element); node->element = TY_(tmbstrdup)(doc->allocator, tag->name); } /* extract a node and its children from a markup tree */ Node *TY_(RemoveNode)(Node *node) { if (node->prev) node->prev->next = node->next; if (node->next) node->next->prev = node->prev; if (node->parent) { if (node->parent->content == node) node->parent->content = node->next; if (node->parent->last == node) node->parent->last = node->prev; } node->parent = node->prev = node->next = NULL; return node; } /* remove node from markup tree and discard it */ Node *TY_(DiscardElement)( TidyDocImpl* doc, Node *element ) { Node *next = NULL; if (element) { next = element->next; TY_(RemoveNode)(element); TY_(FreeNode)( doc, element); } return next; } /* insert "node" into markup tree as the firt element of content of "element" */ void TY_(InsertNodeAtStart)(Node *element, Node *node) { node->parent = element; if (element->content == NULL) element->last = node; else element->content->prev = node; node->next = element->content; node->prev = NULL; element->content = node; } /* insert "node" into markup tree as the last element of content of "element" */ void TY_(InsertNodeAtEnd)(Node *element, Node *node) { node->parent = element; node->prev = element->last; if (element->last != NULL) element->last->next = node; else element->content = 
node; element->last = node; } /* insert "node" into markup tree in place of "element" which is moved to become the child of the node */ static void InsertNodeAsParent(Node *element, Node *node) { node->content = element; node->last = element; node->parent = element->parent; element->parent = node; if (node->parent->content == element) node->parent->content = node; if (node->parent->last == element) node->parent->last = node; node->prev = element->prev; element->prev = NULL; if (node->prev) node->prev->next = node; node->next = element->next; element->next = NULL; if (node->next) node->next->prev = node; } /* insert "node" into markup tree before "element" */ void TY_(InsertNodeBeforeElement)(Node *element, Node *node) { Node *parent; parent = element->parent; node->parent = parent; node->next = element; node->prev = element->prev; element->prev = node; if (node->prev) node->prev->next = node; if (parent->content == element) parent->content = node; } /* insert "node" into markup tree after "element" */ void TY_(InsertNodeAfterElement)(Node *element, Node *node) { Node *parent; parent = element->parent; node->parent = parent; /* AQ - 13 Jan 2000 fix for parent == NULL */ if (parent != NULL && parent->last == element) parent->last = node; else { node->next = element->next; /* AQ - 13 Jan 2000 fix for node->next == NULL */ if (node->next != NULL) node->next->prev = node; } element->next = node; node->prev = element; } static Bool CanPrune( TidyDocImpl* doc, Node *element ) { if ( !cfgBool(doc, TidyDropEmptyElems) ) return no; if ( TY_(nodeIsText)(element) ) return yes; if ( element->content ) return no; if ( element->tag == NULL ) return no; if ( element->tag->model & CM_BLOCK && element->attributes != NULL ) return no; if ( nodeIsA(element) && element->attributes != NULL ) return no; if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) ) return no; if ( element->tag->model & CM_ROW ) return no; if ( element->tag->model & CM_EMPTY ) return no; if ( 
nodeIsAPPLET(element) ) return no; if ( nodeIsOBJECT(element) ) return no; if ( nodeIsSCRIPT(element) && attrGetSRC(element) ) return no; if ( nodeIsTITLE(element) ) return no; /* #433359 - fix by Randy Waki 12 Mar 01 */ if ( nodeIsIFRAME(element) ) return no; /* fix for bug 770297 */ if (nodeIsTEXTAREA(element)) return no; /* fix for ISSUE #7 https://github.com/w3c/tidy-html5/issues/7 */ if (nodeIsCANVAS(element)) return no; if (nodeIsPROGRESS(element)) return no; if ( attrGetID(element) || attrGetNAME(element) ) return no; /* fix for bug 695408; a better fix would look for unknown and */ /* known proprietary attributes that make the element significant */ if (attrGetDATAFLD(element)) return no; /* fix for bug 723772, don't trim new-...-tags */ if (element->tag->id == TidyTag_UNKNOWN) return no; if (nodeIsBODY(element)) return no; if (nodeIsCOLGROUP(element)) return no; /* HTML5 - do NOT drop empty option if it has attributes */ if ( nodeIsOPTION(element) && element->attributes != NULL ) return no; /* fix for #103 - don't drop empty dd tags lest document not validate */ if (nodeIsDD(element)) return no; return yes; } /* return next element */ Node *TY_(TrimEmptyElement)( TidyDocImpl* doc, Node *element ) { if ( CanPrune(doc, element) ) { if (element->type != TextNode) TY_(ReportNotice)(doc, element, NULL, TRIM_EMPTY_ELEMENT); return TY_(DiscardElement)(doc, element); } return element->next; } Node* TY_(DropEmptyElements)(TidyDocImpl* doc, Node* node) { Node* next; while (node) { next = node->next; if (node->content) TY_(DropEmptyElements)(doc, node->content); if (!TY_(nodeIsElement)(node) && !(TY_(nodeIsText)(node) && !(node->start < node->end))) { node = next; continue; } next = TY_(TrimEmptyElement)(doc, node); node = next; } return node; } /* errors in positioning of form start or end tags generally require human intervention to fix */ static void BadForm( TidyDocImpl* doc ) { doc->badForm = yes; /* doc->errors++; */ } /* This maps hello world to hello world If 
last child of element is a text node then trim trailing white space character moving it to after element's end tag. */ static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last ) { Lexer* lexer = doc->lexer; byte c; if (TY_(nodeIsText)(last)) { if (last->end > last->start) { c = (byte) lexer->lexbuf[ last->end - 1 ]; if ( c == ' ' #ifdef COMMENT_NBSP_FIX || c == 160 #endif ) { #ifdef COMMENT_NBSP_FIX /* take care with
hello world to
hello world Trims initial space, by moving it before the start tag, or if this element is the first in parent's content, then by discarding the space */ static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text ) { Lexer* lexer = doc->lexer; Node *prev, *node; if ( TY_(nodeIsText)(text) && lexer->lexbuf[text->start] == ' ' && text->start < text->end ) { if ( (element->tag->model & CM_INLINE) && !(element->tag->model & CM_FIELD) ) { prev = element->prev; if (TY_(nodeIsText)(prev)) { if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ') lexer->lexbuf[(prev->end)++] = ' '; ++(element->start); } else /* create new node */ { node = TY_(NewNode)(lexer->allocator, lexer); node->start = (element->start)++; node->end = element->start; lexer->lexbuf[node->start] = ' '; TY_(InsertNodeBeforeElement)(element ,node); } } /* discard the space in current node */ ++(text->start); } } static Bool IsPreDescendant(Node* node) { Node *parent = node->parent; while (parent) { if (parent->tag && parent->tag->parser == TY_(ParsePre)) return yes; parent = parent->parent; } return no; } static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node) { Node* next; if (!TY_(nodeIsText)(node)) return no; if (node->parent->type == DocTypeTag) return no; if (IsPreDescendant(node)) return no; if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript)) return no; next = node->next; /*
...
*/ if (!next && !TY_(nodeHasCM)(node->parent, CM_INLINE)) return yes; /*...
*/ if (next->type == StartTag) return yes; /* ......
......
...
*/ if (node->prev == NULL && !TY_(nodeHasCM)(node->parent, CM_INLINE)) return yes; /*...
*/ if (!node->prev && !node->parent->prev && !TY_(nodeHasCM)(node->parent->parent, CM_INLINE)) return yes; return no; } static void CleanSpaces(TidyDocImpl* doc, Node* node) { Node* next; while (node) { next = node->next; if (TY_(nodeIsText)(node) && CleanLeadingWhitespace(doc, node)) while (node->start < node->end && TY_(IsWhite)(doc->lexer->lexbuf[node->start])) ++(node->start); if (TY_(nodeIsText)(node) && CleanTrailingWhitespace(doc, node)) while (node->end > node->start && TY_(IsWhite)(doc->lexer->lexbuf[node->end - 1])) --(node->end); if (TY_(nodeIsText)(node) && !(node->start < node->end)) { TY_(RemoveNode)(node); TY_(FreeNode)(doc, node); node = next; continue; } if (node->content) CleanSpaces(doc, node->content); node = next; } } /* Move initial and trailing space out. This routine maps: hello world to hello world and hello world to hello world */ static void TrimSpaces( TidyDocImpl* doc, Node *element) { Node* text = element->content; if (nodeIsPRE(element) || IsPreDescendant(element)) return; if (TY_(nodeIsText)(text)) TrimInitialSpace(doc, element, text); text = element->last; if (TY_(nodeIsText)(text)) TrimTrailingSpace(doc, element, text); } static Bool DescendantOf( Node *element, TidyTagId tid ) { Node *parent; for ( parent = element->parent; parent != NULL; parent = parent->parent ) { if ( TagIsId(parent, tid) ) return yes; } return no; } static Bool InsertMisc(Node *element, Node *node) { if (node->type == CommentTag || node->type == ProcInsTag || node->type == CDATATag || node->type == SectionTag || node->type == AspTag || node->type == JsteTag || node->type == PhpTag ) { TY_(InsertNodeAtEnd)(element, node); return yes; } if ( node->type == XmlDecl ) { Node* root = element; while ( root && root->parent ) root = root->parent; if ( root && !(root->content && root->content->type == XmlDecl)) { TY_(InsertNodeAtStart)( root, node ); return yes; } } /* Declared empty tags seem to be slipping through ** the cracks. 
This is an experiment to figure out ** a decent place to pick them up. */ if ( node->tag && TY_(nodeIsElement)(node) && TY_(nodeCMIsEmpty)(node) && TagId(node) == TidyTag_UNKNOWN && (node->tag->versions & VERS_PROPRIETARY) != 0 ) { TY_(InsertNodeAtEnd)(element, node); return yes; } return no; } static void ParseTag( TidyDocImpl* doc, Node *node, GetTokenMode mode ) { Lexer* lexer = doc->lexer; /* Fix by GLP 2000-12-21. Need to reset insertspace if this is both a non-inline and empty tag (base, link, meta, isindex, hr, area). */ if (node->tag->model & CM_EMPTY) { lexer->waswhite = no; if (node->tag->parser == NULL) return; } else if (!(node->tag->model & CM_INLINE)) lexer->insertspace = no; if (node->tag->parser == NULL) return; if (node->type == StartEndTag) return; (*node->tag->parser)( doc, node, mode ); } /* the doctype has been found after other tags, and needs moving to before the html element */ static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype ) { Node* existing = TY_(FindDocType)( doc ); if ( existing ) { TY_(ReportError)(doc, element, doctype, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, doctype ); } else { TY_(ReportError)(doc, element, doctype, DOCTYPE_AFTER_TAGS ); while ( !nodeIsHTML(element) ) element = element->parent; TY_(InsertNodeBeforeElement)( element, doctype ); } } /* move node to the head, where element is used as starting point in hunt for head. 
normally called during parsing */ static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node ) { Node *head; TY_(RemoveNode)( node ); /* make sure that node is isolated */ if ( TY_(nodeIsElement)(node) ) { TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN ); head = TY_(FindHEAD)(doc); assert(head != NULL); TY_(InsertNodeAtEnd)(head, node); if ( node->tag->parser ) ParseTag( doc, node, IgnoreWhitespace ); } else { TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node ); } } /* moves given node to end of body element */ static void MoveNodeToBody( TidyDocImpl* doc, Node* node ) { Node* body = TY_(FindBody)( doc ); if ( body ) { TY_(RemoveNode)( node ); TY_(InsertNodeAtEnd)( body, node ); } } static void AddClassNoIndent( TidyDocImpl* doc, Node *node ) { ctmbstr sprop = "padding-left: 2ex; margin-left: 0ex" "; margin-top: 0ex; margin-bottom: 0ex"; if ( !cfgBool(doc, TidyDecorateInferredUL) ) return; if ( cfgBool(doc, TidyMakeClean) ) TY_(AddStyleAsClass)( doc, node, sprop ); else TY_(AddStyleProperty)( doc, node, sprop ); } /* element is node created by the lexer upon seeing the start tag, or by the parser when the start tag is inferred */ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) { #if !defined(NDEBUG) && defined(_MSC_VER) static int in_parse_block = 0; #endif Lexer* lexer = doc->lexer; Node *node; Bool checkstack = yes; uint istackbase = 0; #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_block++; SPRTF("Entering ParseBlock %d...\n",in_parse_block); #endif if ( element->tag->model & CM_EMPTY ) { #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_block--; SPRTF("Exit ParseBlockL 1 %d...\n",in_parse_block); #endif return; } if ( nodeIsFORM(element) && DescendantOf(element, TidyTag_FORM) ) TY_(ReportError)(doc, element, NULL, ILLEGAL_NESTING ); /* InlineDup() asks the lexer to insert inline emphasis tags currently pushed on the istack, but take care to avoid propagating inline emphasis 
inside OBJECT or APPLET. For these elements a fresh inline stack context is created and disposed of upon reaching the end of the element. They thus behave like table cells in this respect. */ if (element->tag->model & CM_OBJECT) { istackbase = lexer->istackbase; lexer->istackbase = lexer->istacksize; } if (!(element->tag->model & CM_MIXED)) TY_(InlineDup)( doc, NULL ); mode = IgnoreWhitespace; while ((node = TY_(GetToken)(doc, mode /*MixedContent*/)) != NULL) { /* end tag for this element */ if (node->type == EndTag && node->tag && (node->tag == element->tag || element->was == node->tag)) { TY_(FreeNode)( doc, node ); if (element->tag->model & CM_OBJECT) { /* pop inline stack */ while (lexer->istacksize > lexer->istackbase) TY_(PopInline)( doc, NULL ); lexer->istackbase = istackbase; } element->closed = yes; TrimSpaces( doc, element ); #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_block--; SPRTF("Exit ParseBlock 2 %d...\n",in_parse_block); #endif return; } if ( nodeIsBODY( node ) && DescendantOf( element, TidyTag_HEAD )) { /* If we're in the HEAD, close it before proceeding. This is an extremely rare occurance, but has been observed. 
*/ TY_(UngetToken)( doc ); break; } if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) ) { if ( TY_(nodeIsElement)(node) ) TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, node ); continue; } if (node->type == EndTag) { if (node->tag == NULL) { TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, node ); continue; } else if ( nodeIsBR(node) ) node->type = StartTag; else if ( nodeIsP(node) ) { /* Cannot have a block inside a paragraph, so no checking for an ancestor is necessary -- but we _can_ have paragraphs inside a block, so change it to an implicit empty paragraph, to be dealt with according to the user's options */ node->type = StartEndTag; node->implicit = yes; #if OBSOLETE TY_(CoerceNode)(doc, node, TidyTag_BR, no, no); TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */ TY_(InsertNodeAtEnd)( element, node ); node = InferredTag(doc, TidyTag_BR); #endif } else if (DescendantOf( element, node->tag->id )) { /* if this is the end tag for an ancestor element then infer end tag for this element */ TY_(UngetToken)( doc ); break; #if OBSOLETE Node *parent; for ( parent = element->parent; parent != NULL; parent = parent->parent ) { if (node->tag == parent->tag) { if (!(element->tag->model & CM_OPT)) TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE ); TY_(UngetToken)( doc ); if (element->tag->model & CM_OBJECT) { /* pop inline stack */ while (lexer->istacksize > lexer->istackbase) TY_(PopInline)( doc, NULL ); lexer->istackbase = istackbase; } TrimSpaces( doc, element ); return; } } #endif } else { /* special case etc. 
for stuff moved in front of table */ if ( lexer->exiled && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) ) { TY_(UngetToken)( doc ); TrimSpaces( doc, element ); #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_block--; SPRTF("Exit ParseBlock 2 %d...\n",in_parse_block); #endif return; } } } /* mixed content model permits text */ if (TY_(nodeIsText)(node)) { if ( checkstack ) { checkstack = no; if (!(element->tag->model & CM_MIXED)) { if ( TY_(InlineDup)(doc, node) > 0 ) continue; } } TY_(InsertNodeAtEnd)(element, node); mode = MixedContent; /* HTML4 strict doesn't allow mixed content for elements with %block; as their content model */ /* But only body, map, blockquote, form and noscript have content model %block; */ if ( nodeIsBODY(element) || nodeIsMAP(element) || nodeIsBLOCKQUOTE(element) || nodeIsFORM(element) || nodeIsNOSCRIPT(element) ) TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT ); continue; } if ( InsertMisc(element, node) ) continue; /* allow PARAM elements? */ if ( nodeIsPARAM(node) ) { if ( TY_(nodeHasCM)(element, CM_PARAM) && TY_(nodeIsElement)(node) ) { TY_(InsertNodeAtEnd)(element, node); continue; } /* otherwise discard it */ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, node ); continue; } /* allow AREA elements? */ if ( nodeIsAREA(node) ) { if ( nodeIsMAP(element) && TY_(nodeIsElement)(node) ) { TY_(InsertNodeAtEnd)(element, node); continue; } /* otherwise discard it */ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, node ); continue; } /* ignore unknown start/end tags */ if ( node->tag == NULL ) { TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, node ); continue; } /* Allow CM_INLINE elements here. Allow CM_BLOCK elements here unless lexer->excludeBlocks is yes. LI and DD are special cased. Otherwise infer end tag for this element. 
*/ if ( !TY_(nodeHasCM)(node, CM_INLINE) ) { if ( !TY_(nodeIsElement)(node) ) { if ( nodeIsFORM(node) ) BadForm( doc ); TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, node ); continue; } /* #427671 - Fix by Randy Waki - 10 Aug 00 */ /* If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION start tag, discard the start tag and let the subsequent content get parsed as content of the enclosing LI. This seems to mimic IE and Netscape, and avoids an infinite loop: without this check, ParseBlock (which is parsing the LI's content) and ParseList (which is parsing the LI's parent's content) repeatedly defer to each other to parse the illegal start tag, each time inferring a missing or111222333444555
111222333444555
111222333444555
mapto
*/ if ( nodeIsP(node) && node->type == StartTag && ( (mode & Preformatted) || nodeIsDT(element) || DescendantOf(element, TidyTag_DT ) ) ) { node->tag = TY_(LookupTagDef)( TidyTag_BR ); TidyDocFree(doc, node->element); node->element = TY_(tmbstrdup)(doc->allocator, "br"); TrimSpaces(doc, element); TY_(InsertNodeAtEnd)(element, node); continue; } /*allowed within
in HTML 4.01 Transitional */ if ( nodeIsP(node) && node->type == StartTag && nodeIsADDRESS(element) ) { TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT ); TY_(InsertNodeAtEnd)(element, node); (*node->tag->parser)( doc, node, mode ); continue; } /* ignore unknown and PARAM tags */ if ( node->tag == NULL || nodeIsPARAM(node) ) { TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node ); continue; } if ( nodeIsBR(node) && node->type == EndTag ) node->type = StartTag; if ( node->type == EndTag ) { /* coerce to
*/ if ( nodeIsBR(node) ) node->type = StartTag; else if ( nodeIsP(node) ) { /* coerce unmatched to
*/ if ( !DescendantOf(element, TidyTag_P) ) { TY_(CoerceNode)(doc, node, TidyTag_BR, no, no); TrimSpaces( doc, element ); TY_(InsertNodeAtEnd)( element, node ); node = TY_(InferredTag)(doc, TidyTag_BR); TY_(InsertNodeAtEnd)( element, node ); /* todo: check this */ continue; } } else if ( TY_(nodeHasCM)(node, CM_INLINE) && !nodeIsA(node) && !TY_(nodeHasCM)(node, CM_OBJECT) && TY_(nodeHasCM)(element, CM_INLINE) ) { /* allow any inline end tag to end current element */ /* http://tidy.sf.net/issue/1426419 */ /* but, like the browser, retain an earlier inline element. This is implemented by setting the lexer into a mode where it gets tokens from the inline stack rather than from the input stream. Check if the scenerio fits. */ if ( !nodeIsA(element) && (node->tag != element->tag) && TY_(IsPushed)( doc, node ) && TY_(IsPushed)( doc, element ) ) { /* we have something like bold bold and italic italics */ if ( TY_(SwitchInline)( doc, element, node ) ) { TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG); TY_(UngetToken)( doc ); /* put this back */ TY_(InlineDup1)( doc, NULL, element ); /* dupe the , after */ if (!(mode & Preformatted)) TrimSpaces( doc, element ); #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_inline--; SPRTF("Exit ParseInline 4 %d...\n",in_parse_inline); #endif return; /* close , but will re-open it, after */ } } TY_(PopInline)( doc, element ); if ( !nodeIsA(element) ) { if ( nodeIsA(node) && node->tag != element->tag ) { TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE ); TY_(UngetToken)( doc ); } else { TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG); TY_(FreeNode)( doc, node); } if (!(mode & Preformatted)) TrimSpaces(doc, element); #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_inline--; SPRTF("Exit ParseInline 5 %d...\n",in_parse_inline); #endif return; } /* if parent is then discard unexpected inline end tag */ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node); continue; } /* 
special case etc. for stuff moved in front of table */ else if ( lexer->exiled && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) ) { TY_(UngetToken)( doc ); TrimSpaces(doc, element); #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_inline--; SPRTF("Exit ParseInline 6 %d...\n",in_parse_inline); #endif return; } } /* allow any header tag to end current header */ if ( TY_(nodeHasCM)(node, CM_HEADING) && TY_(nodeHasCM)(element, CM_HEADING) ) { if ( node->tag == element->tag ) { TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG ); TY_(FreeNode)( doc, node); } else { TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE ); TY_(UngetToken)( doc ); } if (!(mode & Preformatted)) TrimSpaces(doc, element); #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_inline--; SPRTF("Exit ParseInline 7 %d...\n",in_parse_inline); #endif return; } /* an tag to ends any open element but is mapped to */ /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */ /* if (node->tag == doc->tags.tag_a && !node->implicit && TY_(IsPushed)(doc, node)) */ if ( nodeIsA(node) && !node->implicit && (nodeIsA(element) || DescendantOf(element, TidyTag_A)) ) { /* coerce to unless it has some attributes */ /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */ /* other fixes by Dave Raggett */ /* if (node->attributes == NULL) */ if (node->type != EndTag && node->attributes == NULL && cfgBool(doc, TidyCoerceEndTags) ) { node->type = EndTag; TY_(ReportError)(doc, element, node, COERCE_TO_ENDTAG); /* TY_(PopInline)( doc, node ); */ TY_(UngetToken)( doc ); continue; } TY_(UngetToken)( doc ); TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE); /* TY_(PopInline)( doc, element ); */ if (!(mode & Preformatted)) TrimSpaces(doc, element); #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_inline--; SPRTF("Exit ParseInline 8 %d...\n",in_parse_inline); #endif return; } if (element->tag->model & CM_HEADING) { if ( nodeIsCENTER(node) || nodeIsDIV(node) ) { if 
(!TY_(nodeIsElement)(node)) { TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node); continue; } TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN); /* insert center as parent if heading is empty */ if (element->content == NULL) { InsertNodeAsParent(element, node); continue; } /* split heading and make center parent of 2nd part */ TY_(InsertNodeAfterElement)(element, node); if (!(mode & Preformatted)) TrimSpaces(doc, element); element = TY_(CloneNode)( doc, element ); TY_(InsertNodeAtEnd)(node, element); continue; } if ( nodeIsHR(node) ) { if ( !TY_(nodeIsElement)(node) ) { TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node); continue; } TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN); /* insert hr before heading if heading is empty */ if (element->content == NULL) { TY_(InsertNodeBeforeElement)(element, node); continue; } /* split heading and insert hr before 2nd part */ TY_(InsertNodeAfterElement)(element, node); if (!(mode & Preformatted)) TrimSpaces(doc, element); element = TY_(CloneNode)( doc, element ); TY_(InsertNodeAfterElement)(node, element); continue; } } if ( nodeIsDT(element) ) { if ( nodeIsHR(node) ) { Node *dd; if ( !TY_(nodeIsElement)(node) ) { TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node); continue; } TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN); dd = TY_(InferredTag)(doc, TidyTag_DD); /* insert hr within dd before dt if dt is empty */ if (element->content == NULL) { TY_(InsertNodeBeforeElement)(element, dd); TY_(InsertNodeAtEnd)(dd, node); continue; } /* split dt and insert hr within dd before 2nd part */ TY_(InsertNodeAfterElement)(element, dd); TY_(InsertNodeAtEnd)(dd, node); if (!(mode & Preformatted)) TrimSpaces(doc, element); element = TY_(CloneNode)( doc, element ); TY_(InsertNodeAfterElement)(dd, element); continue; } } /* if this is the end tag for an ancestor element then infer end tag for this element */ 
if (node->type == EndTag) { for (parent = element->parent; parent != NULL; parent = parent->parent) { if (node->tag == parent->tag) { if (!(element->tag->model & CM_OPT) && !element->implicit) TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE); if( TY_(IsPushedLast)( doc, element, node ) ) TY_(PopInline)( doc, element ); TY_(UngetToken)( doc ); if (!(mode & Preformatted)) TrimSpaces(doc, element); #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_inline--; SPRTF("Exit ParseInline 9 %d...\n",in_parse_inline); #endif return; } } } /* block level tags end this element */ if (!(node->tag->model & CM_INLINE) && !(element->tag->model & CM_MIXED)) { if ( !TY_(nodeIsElement)(node) ) { TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node); continue; } /* HTML5 */ if (nodeIsDATALIST(element)) { TY_(ConstrainVersion)( doc, ~VERS_HTML5 ); } else if (!(element->tag->model & CM_OPT)) TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE); if (node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK)) { MoveToHead(doc, element, node); continue; } /* prevent anchors from propagating into block tags except for headings h1 to h6 */ if ( nodeIsA(element) ) { if (node->tag && !(node->tag->model & CM_HEADING)) TY_(PopInline)( doc, element ); else if (!(element->content)) { TY_(DiscardElement)( doc, element ); TY_(UngetToken)( doc ); #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_inline--; SPRTF("Exit ParseInline 10 %d...\n",in_parse_inline); #endif return; } } TY_(UngetToken)( doc ); if (!(mode & Preformatted)) TrimSpaces(doc, element); #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_inline--; SPRTF("Exit ParseInline 11 %d...\n",in_parse_inline); #endif return; } /* parse inline element */ if (TY_(nodeIsElement)(node)) { if (node->implicit) TY_(ReportError)(doc, element, node, INSERTING_TAG); /* trim white space before
*/ if ( nodeIsBR(node) ) TrimSpaces(doc, element); TY_(InsertNodeAtEnd)(element, node); ParseTag(doc, node, mode); continue; } /* discard unexpected tags */ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node ); continue; } if (!(element->tag->model & CM_OPT)) TY_(ReportError)(doc, element, node, MISSING_ENDTAG_FOR); #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_inline--; SPRTF("Exit ParseInline 12 %d...\n",in_parse_inline); #endif } void TY_(ParseEmpty)(TidyDocImpl* doc, Node *element, GetTokenMode mode) { Lexer* lexer = doc->lexer; if ( lexer->isvoyager ) { Node *node = TY_(GetToken)( doc, mode); if ( node ) { if ( !(node->type == EndTag && node->tag == element->tag) ) { /* TY_(ReportError)(doc, element, node, ELEMENT_NOT_EMPTY); */ TY_(UngetToken)( doc ); } else { TY_(FreeNode)( doc, node ); } } } } void TY_(ParseDefList)(TidyDocImpl* doc, Node *list, GetTokenMode mode) { Lexer* lexer = doc->lexer; Node *node, *parent; if (list->tag->model & CM_EMPTY) return; lexer->insert = NULL; /* defer implicit inline start tags */ while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL) { if (node->tag == list->tag && node->type == EndTag) { TY_(FreeNode)( doc, node); list->closed = yes; return; } /* deal with comments etc. 
*/ if (InsertMisc(list, node)) continue; if (TY_(nodeIsText)(node)) { TY_(UngetToken)( doc ); node = TY_(InferredTag)(doc, TidyTag_DT); TY_(ReportError)(doc, list, node, MISSING_STARTTAG); } if (node->tag == NULL) { TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node); continue; } /* if this is the end tag for an ancestor element then infer end tag for this element */ if (node->type == EndTag) { Bool discardIt = no; if ( nodeIsFORM(node) ) { BadForm( doc ); TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node ); continue; } for (parent = list->parent; parent != NULL; parent = parent->parent) { /* Do not match across BODY to avoid infinite loop between ParseBody and this parser, See http://tidy.sf.net/bug/1098012. */ if (nodeIsBODY(parent)) { discardIt = yes; break; } if (node->tag == parent->tag) { TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE); TY_(UngetToken)( doc ); return; } } if (discardIt) { TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node); continue; } } /* center in a dt or a dl breaks the dl list in two */ if ( nodeIsCENTER(node) ) { if (list->content) TY_(InsertNodeAfterElement)(list, node); else /* trim empty dl list */ { TY_(InsertNodeBeforeElement)(list, node); /* #540296 tidy dumps with empty definition list */ #if 0 TY_(DiscardElement)(list); #endif } /* #426885 - fix by Glenn Carroll 19 Apr 00, and Gary Dechaines 11 Aug 00 */ /* ParseTag can destroy node, if it finds that * thisis followed immediately by . * It's awkward but necessary to determine if this * has happened. */ parent = node->parent; /* and parse contents of center */ lexer->excludeBlocks = no; ParseTag( doc, node, mode); lexer->excludeBlocks = yes; /* now create a new dl element, * unless node has been blown away because the * center was empty, as above. 
*/ if (parent->last == node) { list = TY_(InferredTag)(doc, TidyTag_DL); TY_(InsertNodeAfterElement)(node, list); } continue; } if ( !(nodeIsDT(node) || nodeIsDD(node)) ) { TY_(UngetToken)( doc ); if (!(node->tag->model & (CM_BLOCK | CM_INLINE))) { TY_(ReportError)(doc, list, node, TAG_NOT_ALLOWED_IN); return; } /* if DD appeared directly in BODY then exclude blocks */ if (!(node->tag->model & CM_INLINE) && lexer->excludeBlocks) return; node = TY_(InferredTag)(doc, TidyTag_DD); TY_(ReportError)(doc, list, node, MISSING_STARTTAG); } if (node->type == EndTag) { TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node); continue; } /* node should be
or | */ TY_(InsertNodeAtEnd)(row, node); exclude_state = lexer->excludeBlocks; lexer->excludeBlocks = no; ParseTag( doc, node, IgnoreWhitespace); lexer->excludeBlocks = exclude_state; /* pop inline stack */ while ( lexer->istacksize > lexer->istackbase ) TY_(PopInline)( doc, NULL ); } } void TY_(ParseRowGroup)(TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSED(mode)) { Lexer* lexer = doc->lexer; Node *node, *parent; if (rowgroup->tag->model & CM_EMPTY) return; while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) { if (node->tag == rowgroup->tag) { if (node->type == EndTag) { rowgroup->closed = yes; TY_(FreeNode)( doc, node); return; } TY_(UngetToken)( doc ); return; } /* if |
---|
into each disallowed child where <pre> is allowed in order to replicate
some browsers' behaviour, but there are a lot of exceptions, e.g. Internet
Explorer does not propagate <pre> into table cells while Mozilla does.
Opera 6 never propagates <pre> into block-level elements while Opera 7
behaves much like Mozilla.

Tidy thus behaves mostly like Opera 6, except for nested <pre> elements,
which are handled the way Mozilla takes them (Opera 6 closes all <pre>
after the first </pre>).

There are similar issues, like replacing <p> in <pre> with <br>, for example

  <pre>...<p>...</pre>                 (Input)
  <pre>...<br>...</pre>                (Tidy)
  <pre>...<br>...</pre>                (Opera 7 and Internet Explorer)
  <pre>...<br><br>...</pre>            (Opera 6 and Mozilla)

  <pre>...<p>...</p>...</pre>          (Input)
  <pre>...<br>......</pre>             (Tidy, BUG!)
  <pre>...<br>...<br>...</pre>         (Internet Explorer)
  <pre>...<br><br>...<br><br>...</pre> (Mozilla, Opera 6)
  <pre>...<br>...<br><br>...</pre>     (Opera 7)

or something similar; they could also be closing the <pre> and propagate
the <pre> into the newly opened <p>.

Todo: IMG, OBJECT, APPLET, BIG, SMALL, SUB, SUP, FONT, and BASEFONT are
disallowed in <pre>; Tidy neither detects this nor does it perform any
cleanup operation. Tidy should at least issue a warning if it encounters
such constructs.

Todo: discarding </p> is obviously a bug, it should be replaced by <br>
. */ TY_(InsertNodeAfterElement)(pre, node); TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_BEFORE); ParseTag(doc, node, IgnoreWhitespace); newnode = TY_(InferredTag)(doc, TidyTag_PRE); TY_(ReportError)(doc, pre, newnode, INSERTING_TAG); pre = newnode; TY_(InsertNodeAfterElement)(node, pre); continue; } if ( nodeIsP(node) ) { if (node->type == StartTag) { TY_(ReportError)(doc, pre, node, USING_BR_INPLACE_OF); /* trim white space before <p> in <pre> */
TrimSpaces(doc, pre); /* coerce <p> to <br>; a </p> end tag is discarded in the else branch below */
TY_(CoerceNode)(doc, node, TidyTag_BR, no, no); TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */ TY_(InsertNodeAtEnd)( pre, node ); } else { TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node); } continue; } if ( TY_(nodeIsElement)(node) ) { /* trim white space before <br>
*/ if ( nodeIsBR(node) ) TrimSpaces(doc, pre); TY_(InsertNodeAtEnd)(pre, node); ParseTag(doc, node, Preformatted); continue; } /* discard unexpected tags */ TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node); } TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_FOR); } void TY_(ParseOptGroup)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode)) { Lexer* lexer = doc->lexer; Node *node; lexer->insert = NULL; /* defer implicit inline start tags */ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) { if (node->tag == field->tag && node->type == EndTag) { TY_(FreeNode)( doc, node); field->closed = yes; TrimSpaces(doc, field); return; } /* deal with comments etc. */ if (InsertMisc(field, node)) continue; if ( node->type == StartTag && (nodeIsOPTION(node) || nodeIsOPTGROUP(node)) ) { if ( nodeIsOPTGROUP(node) ) TY_(ReportError)(doc, field, node, CANT_BE_NESTED); TY_(InsertNodeAtEnd)(field, node); ParseTag(doc, node, MixedContent); continue; } /* discard unexpected tags */ TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, node); } } void TY_(ParseSelect)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode)) { #if !defined(NDEBUG) && defined(_MSC_VER) static int in_parse_select = 0; #endif Lexer* lexer = doc->lexer; Node *node; lexer->insert = NULL; /* defer implicit inline start tags */ #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_select++; SPRTF("Entering ParseSelect %d...\n",in_parse_select); #endif while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) { if (node->tag == field->tag && node->type == EndTag) { TY_(FreeNode)( doc, node); field->closed = yes; TrimSpaces(doc, field); #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_select--; SPRTF("Exit ParseSelect 1 %d...\n",in_parse_select); #endif return; } /* deal with comments etc. 
*/ if (InsertMisc(field, node)) continue; if ( node->type == StartTag && ( nodeIsOPTION(node) || nodeIsOPTGROUP(node) || nodeIsDATALIST(node) || nodeIsSCRIPT(node)) ) { TY_(InsertNodeAtEnd)(field, node); ParseTag(doc, node, IgnoreWhitespace); continue; } /* discard unexpected tags */ TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node); } TY_(ReportError)(doc, field, node, MISSING_ENDTAG_FOR); #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_select--; SPRTF("Exit ParseSelect 2 %d...\n",in_parse_select); #endif } /* HTML5 */ void TY_(ParseDatalist)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode)) { #if !defined(NDEBUG) && defined(_MSC_VER) static int in_parse_datalist = 0; #endif Lexer* lexer = doc->lexer; Node *node; lexer->insert = NULL; /* defer implicit inline start tags */ #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_datalist++; SPRTF("Entering ParseDatalist %d...\n",in_parse_datalist); #endif while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) { if (node->tag == field->tag && node->type == EndTag) { TY_(FreeNode)( doc, node); field->closed = yes; TrimSpaces(doc, field); #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_datalist--; SPRTF("Exit ParseDatalist 1 %d...\n",in_parse_datalist); #endif return; } /* deal with comments etc. 
*/ if (InsertMisc(field, node)) continue; if ( node->type == StartTag && ( nodeIsOPTION(node) || nodeIsOPTGROUP(node) || nodeIsDATALIST(node) || nodeIsSCRIPT(node)) ) { TY_(InsertNodeAtEnd)(field, node); ParseTag(doc, node, IgnoreWhitespace); continue; } /* discard unexpected tags */ TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node); } TY_(ReportError)(doc, field, node, MISSING_ENDTAG_FOR); #if !defined(NDEBUG) && defined(_MSC_VER) in_parse_datalist--; SPRTF("Exit ParseDatalist 2 %d...\n",in_parse_datalist); #endif } void TY_(ParseText)(TidyDocImpl* doc, Node *field, GetTokenMode mode) { Lexer* lexer = doc->lexer; Node *node; lexer->insert = NULL; /* defer implicit inline start tags */ if ( nodeIsTEXTAREA(field) ) mode = Preformatted; else mode = MixedContent; /* kludge for font tags */ while ((node = TY_(GetToken)(doc, mode)) != NULL) { if (node->tag == field->tag && node->type == EndTag) { TY_(FreeNode)( doc, node); field->closed = yes; TrimSpaces(doc, field); return; } /* deal with comments etc. */ if (InsertMisc(field, node)) continue; if (TY_(nodeIsText)(node)) { /* only called for 1st child */ if (field->content == NULL && !(mode & Preformatted)) TrimSpaces(doc, field); if (node->start >= node->end) { TY_(FreeNode)( doc, node); continue; } TY_(InsertNodeAtEnd)(field, node); continue; } /* for textarea should all cases of < and & be escaped? */ /* discard inline tags e.g. 
font */ if ( node->tag && node->tag->model & CM_INLINE && !(node->tag->model & CM_FIELD)) /* #487283 - fix by Lee Passey 25 Jan 02 */ { TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node); continue; } /* terminate element on other tags */ if (!(field->tag->model & CM_OPT)) TY_(ReportError)(doc, field, node, MISSING_ENDTAG_BEFORE); TY_(UngetToken)( doc ); TrimSpaces(doc, field); return; } if (!(field->tag->model & CM_OPT)) TY_(ReportError)(doc, field, node, MISSING_ENDTAG_FOR); } void TY_(ParseTitle)(TidyDocImpl* doc, Node *title, GetTokenMode ARG_UNUSED(mode)) { Node *node; while ((node = TY_(GetToken)(doc, MixedContent)) != NULL) { if (node->tag == title->tag && node->type == StartTag && cfgBool(doc, TidyCoerceEndTags) ) { TY_(ReportError)(doc, title, node, COERCE_TO_ENDTAG); node->type = EndTag; TY_(UngetToken)( doc ); continue; } else if (node->tag == title->tag && node->type == EndTag) { TY_(FreeNode)( doc, node); title->closed = yes; TrimSpaces(doc, title); return; } if (TY_(nodeIsText)(node)) { /* only called for 1st child */ if (title->content == NULL) TrimInitialSpace(doc, title, node); if (node->start >= node->end) { TY_(FreeNode)( doc, node); continue; } TY_(InsertNodeAtEnd)(title, node); continue; } /* deal with comments etc. */ if (InsertMisc(title, node)) continue; /* discard unknown tags */ if (node->tag == NULL) { TY_(ReportError)(doc, title, node, DISCARDING_UNEXPECTED); TY_(FreeNode)( doc, node); continue; } /* pushback unexpected tokens */ TY_(ReportError)(doc, title, node, MISSING_ENDTAG_BEFORE); TY_(UngetToken)( doc ); TrimSpaces(doc, title); return; } TY_(ReportError)(doc, title, node, MISSING_ENDTAG_FOR); } /* This isn't quite right for CDATA content as it recognises tags within the content and parses them accordingly. This will unfortunately screw up scripts which include < + letter, < + !, < + ? 
or < + / + letter */ void TY_(ParseScript)(TidyDocImpl* doc, Node *script, GetTokenMode ARG_UNUSED(mode)) { Node *node; doc->lexer->parent = script; node = TY_(GetToken)(doc, CdataContent); doc->lexer->parent = NULL; if (node) { TY_(InsertNodeAtEnd)(script, node); } else { /* handle e.g. a document like "