hello world to
hello world Trims initial space, by moving it before the start tag, or if this element is the first in parent's content, then by discarding the space */ static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text ) { Lexer* lexer = doc->lexer; Node *prev, *node; if ( TY_(nodeIsText)(text) && lexer->lexbuf[text->start] == ' ' && text->start < text->end ) { if ( (element->tag->model & CM_INLINE) && !(element->tag->model & CM_FIELD) ) { prev = element->prev; if (TY_(nodeIsText)(prev)) { if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ') lexer->lexbuf[(prev->end)++] = ' '; ++(element->start); } else /* create new node */ { node = TY_(NewNode)(lexer->allocator, lexer); node->start = (element->start)++; node->end = element->start; lexer->lexbuf[node->start] = ' '; TY_(InsertNodeBeforeElement)(element ,node); #if !defined(NDEBUG) && defined(_MSC_VER) SPRTF("TrimInitialSpace: Created text node, inserted before <%s>\n", (element->element ? element->element : "unknown")); #endif } } /* discard the space in current node */ ++(text->start); } } static Bool IsPreDescendant(Node* node) { Node *parent = node->parent; while (parent) { if (parent->tag && parent->tag->parser == TY_(ParsePre)) return yes; parent = parent->parent; } return no; } static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node) { Node* next; if (!TY_(nodeIsText)(node)) return no; if (node->parent->type == DocTypeTag) return no; if (IsPreDescendant(node)) return no; if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript)) return no; next = node->next; /*
...
*/ if (!next && !TY_(nodeHasCM)(node->parent, CM_INLINE)) return yes; /*...
...
*/ if (next->type == StartTag) return yes; /* ...*/ if (next->type == StartEndTag) return yes; /* evil adjacent text nodes, Tidy should not generate these :-( */ if (TY_(nodeIsText)(next) && next->start < next->end && TY_(IsWhite)(doc->lexer->lexbuf[next->start])) return yes; return no; } static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node) { if (!TY_(nodeIsText)(node)) return no; if (node->parent->type == DocTypeTag) return no; if (IsPreDescendant(node)) return no; if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript)) return no; /*
...
......
...
*/ if (node->prev == NULL && !TY_(nodeHasCM)(node->parent, CM_INLINE)) return yes; /*...
... */ if (node->prev && !TY_(nodeHasCM)(node->prev, CM_INLINE) && TY_(nodeIsElement)(node->prev)) return yes; /*...
*/ if (!node->prev && !node->parent->prev && !TY_(nodeHasCM)(node->parent->parent, CM_INLINE)) return yes; return no; } static void CleanSpaces(TidyDocImpl* doc, Node* node) { Node* next; while (node) { next = node->next; if (TY_(nodeIsText)(node) && CleanLeadingWhitespace(doc, node)) while (node->start < node->end && TY_(IsWhite)(doc->lexer->lexbuf[node->start])) ++(node->start); if (TY_(nodeIsText)(node) && CleanTrailingWhitespace(doc, node)) while (node->end > node->start && TY_(IsWhite)(doc->lexer->lexbuf[node->end - 1])) --(node->end); if (TY_(nodeIsText)(node) && !(node->start < node->end)) { TY_(RemoveNode)(node); TY_(FreeNode)(doc, node); node = next; continue; } if (node->content) CleanSpaces(doc, node->content); node = next; } } /* Move initial and trailing space out. This routine maps: hello world to hello world and hello world to hello world */ static void TrimSpaces( TidyDocImpl* doc, Node *element) { Node* text = element->content; if (nodeIsPRE(element) || IsPreDescendant(element)) return; if (TY_(nodeIsText)(text)) TrimInitialSpace(doc, element, text); text = element->last; if (TY_(nodeIsText)(text)) TrimTrailingSpace(doc, element, text); } static Bool DescendantOf( Node *element, TidyTagId tid ) { Node *parent; for ( parent = element->parent; parent != NULL; parent = parent->parent ) { if ( TagIsId(parent, tid) ) return yes; } return no; } static Bool InsertMisc(Node *element, Node *node) { if (node->type == CommentTag || node->type == ProcInsTag || node->type == CDATATag || node->type == SectionTag || node->type == AspTag || node->type == JsteTag || node->type == PhpTag ) { TY_(InsertNodeAtEnd)(element, node); return yes; } if ( node->type == XmlDecl ) { Node* root = element; while ( root && root->parent ) root = root->parent; if ( root && !(root->content && root->content->type == XmlDecl)) { TY_(InsertNodeAtStart)( root, node ); return yes; } } /* Declared empty tags seem to be slipping through ** the cracks. 
This is an experiment to figure out ** a decent place to pick them up. */ if ( node->tag && TY_(nodeIsElement)(node) && TY_(nodeCMIsEmpty)(node) && TagId(node) == TidyTag_UNKNOWN && (node->tag->versions & VERS_PROPRIETARY) != 0 ) { TY_(InsertNodeAtEnd)(element, node); return yes; } return no; } static void ParseTag( TidyDocImpl* doc, Node *node, GetTokenMode mode ) { Lexer* lexer = doc->lexer; if (node->tag == NULL) /* [i_a]2 prevent crash for active content (php, asp) docs */ return; /* Fix by GLP 2000-12-21. Need to reset insertspace if this is both a non-inline and empty tag (base, link, meta, isindex, hr, area). */ if (node->tag->model & CM_EMPTY) { lexer->waswhite = no; if (node->tag->parser == NULL) return; } else if (!(node->tag->model & CM_INLINE)) lexer->insertspace = no; if (node->tag->parser == NULL) return; if (node->type == StartEndTag) return; lexer->parent = node; /* [i_a]2 added this - not sure why - CHECKME: */ (*node->tag->parser)( doc, node, mode ); } /* the doctype has been found after other tags, and needs moving to before the html element */ static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype ) { Node* existing = TY_(FindDocType)( doc ); if ( existing ) { TY_(Report)(doc, element, doctype, DISCARDING_UNEXPECTED ); TY_(FreeNode)( doc, doctype ); } else { TY_(Report)(doc, element, doctype, DOCTYPE_AFTER_TAGS ); while ( !nodeIsHTML(element) ) element = element->parent; TY_(InsertNodeBeforeElement)( element, doctype ); } } /* move node to the head, where element is used as starting point in hunt for head. 
normally called during parsing.

   The node is first detached from the tree. An element node is then
   reported (TAG_NOT_ALLOWED_IN), appended to the node returned by
   FindHEAD and, when its tag has a parser, parsed with
   IgnoreWhitespace. Any other node is reported
   (DISCARDING_UNEXPECTED) and freed.
*/
static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node )
{
    Node *head;

    TY_(RemoveNode)( node );  /* make sure that node is isolated */

    if ( TY_(nodeIsElement)(node) )
    {
        TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN );

        head = TY_(FindHEAD)(doc);
        /* only checked in builds where assert is active */
        assert(head != NULL);

        TY_(InsertNodeAtEnd)(head, node);

        if ( node->tag->parser )
            ParseTag( doc, node, IgnoreWhitespace );
    }
    else
    {
        TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
        TY_(FreeNode)( doc, node );
    }
}

/* moves given node to end of body element; does nothing when
   FindBody returns NULL */
static void MoveNodeToBody( TidyDocImpl* doc, Node* node )
{
    Node* body = TY_(FindBody)( doc );
    if ( body )
    {
        TY_(RemoveNode)( node );
        TY_(InsertNodeAtEnd)( body, node );
    }
}

/* Attaches a fixed set of padding/margin style properties to 'node'.
   Does nothing unless the TidyDecorateInferredUL option is set. With
   TidyMakeClean set the style is added through AddStyleAsClass,
   otherwise through AddStyleProperty. */
static void AddClassNoIndent( TidyDocImpl* doc, Node *node )
{
    ctmbstr sprop =
        "padding-left: 2ex; margin-left: 0ex"
        "; margin-top: 0ex; margin-bottom: 0ex";
    if ( !cfgBool(doc, TidyDecorateInferredUL) )
        return;
    if ( cfgBool(doc, TidyMakeClean) )
        TY_(AddStyleAsClass)( doc, node, sprop );
    else
        TY_(AddStyleProperty)( doc, node, sprop );
}

/*
   element is node created by the lexer
   upon seeing the start tag, or by the
   parser when the start tag is inferred

   Reads tokens with GetToken and attaches them to 'element' until
   the element's end tag is seen, an end tag for an ancestor is seen
   (the token is pushed back and the end tag is inferred), or the
   token stream ends. Elements with an empty content model return
   immediately. On every exit taken after the token loop has started,
   TrimSpaces is applied to 'element'.

   'mode' is the white-space mode requested by the caller; it is
   overridden below depending on the element's content model.

   NOTE(review): the "#427671" comment inside the loop is truncated
   and the LI check it describes is not present in the code that
   follows it, so part of this function may be missing from this
   copy - compare with the upstream file.
*/
void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
{
#if !defined(NDEBUG) && defined(_MSC_VER)
    /* debug-only nesting depth and call counter used by the traces */
    static int in_parse_block = 0;
    static int parse_block_cnt = 0;
#endif
    Lexer* lexer = doc->lexer;
    Node *node;
    Bool checkstack = yes;   /* yes until the first text token has been handled */
    uint istackbase = 0;     /* saved lexer->istackbase for CM_OBJECT elements */
#if !defined(NDEBUG) && defined(_MSC_VER)
    in_parse_block++;
    parse_block_cnt++;
    SPRTF("Entering ParseBlock %d... %d %s\n",in_parse_block,parse_block_cnt,
        ((element && element->element) ? element->element : ""));
#endif

    /* an empty element has no content to parse */
    if ( element->tag->model & CM_EMPTY )
    {
#if !defined(NDEBUG) && defined(_MSC_VER)
        in_parse_block--;
        SPRTF("Exit ParseBlockL 1 %d...\n",in_parse_block);
#endif
        return;
    }

    /* a FORM nested in another FORM is reported but still parsed */
    if ( nodeIsFORM(element) &&
         DescendantOf(element, TidyTag_FORM) )
        TY_(Report)(doc, element, NULL, ILLEGAL_NESTING );

    /*
     InlineDup() asks the lexer to insert inline emphasis tags
     currently pushed on the istack, but take care to avoid
     propagating inline emphasis inside OBJECT or APPLET.
     For these elements a fresh inline stack context is created
     and disposed of upon reaching the end of the element.
     They thus behave like table cells in this respect.
    */
    if (element->tag->model & CM_OBJECT)
    {
        istackbase = lexer->istackbase;
        lexer->istackbase = lexer->istacksize;
    }

    if (!(element->tag->model & CM_MIXED))
        TY_(InlineDup)( doc, NULL );

    /*\
     *  Issue #212 - If it is likely that it may be necessary
     *  to move a leading space into a text node before this
     *  element, then keep the mode MixedContent to keep any
     *  leading space
    \*/
    if ( !(element->tag->model & CM_INLINE) ||
          (element->tag->model & CM_FIELD ) )
    {
        mode = IgnoreWhitespace;
    }
    else if (mode == IgnoreWhitespace)
    {
        /* Issue #212 - Further fix in case ParseBlock() is called with 'IgnoreWhitespace'
           when such a leading space may need to be inserted before this element to
           preserve the browser view */
        mode = MixedContent;
    }

    while ((node = TY_(GetToken)(doc, mode /*MixedContent*/)) != NULL)
    {
        /* end tag for this element */
        if (node->type == EndTag && node->tag &&
            (node->tag == element->tag || element->was == node->tag))
        {
            TY_(FreeNode)( doc, node );

            if (element->tag->model & CM_OBJECT)
            {
                /* pop inline stack */
                while (lexer->istacksize > lexer->istackbase)
                    TY_(PopInline)( doc, NULL );
                lexer->istackbase = istackbase;
            }

            element->closed = yes;
            TrimSpaces( doc, element );
#if !defined(NDEBUG) && defined(_MSC_VER)
            in_parse_block--;
            SPRTF("Exit ParseBlock 2 %d...\n",in_parse_block);
#endif
            return;
        }

        /* html, head and body tokens are dropped here; only element
           tokens are reported */
        if ( nodeIsHTML(node) ||
             nodeIsHEAD(node) ||
             nodeIsBODY(node) )
        {
            if ( TY_(nodeIsElement)(node) )
                TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, node );
            continue;
        }

        if (node->type == EndTag)
        {
            if (node->tag == NULL)
            {
                /* end tag of an unknown element */
                TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
                TY_(FreeNode)( doc, node );
                continue;
            }
            else if ( nodeIsBR(node) )
                /* an end tag for BR is treated as a BR start tag */
                node->type = StartTag;
            else if ( nodeIsP(node) )
            {
                /* Cannot have a block inside a paragraph, so no checking
                   for an ancestor is necessary -- but we _can_ have
                   paragraphs inside a block, so change it to an implicit
                   empty paragraph, to be dealt with according to the user's
                   options
                */
                node->type = StartEndTag;
                node->implicit = yes;
            }
            else if (DescendantOf( element, node->tag->id ))
            {
                /*
                  if this is the end tag for an ancestor element
                  then infer end tag for this element
                */
                TY_(UngetToken)( doc );
                break;
            }
            else
            {
                /* special case etc. for stuff moved in front of table */
                if ( lexer->exiled
                     && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
                {
                    /* NOTE(review): unlike the other exits, this one
                       does not pop the inline stack or restore
                       lexer->istackbase for CM_OBJECT elements -
                       confirm this is intended */
                    TY_(UngetToken)( doc );
                    TrimSpaces( doc, element );
#if !defined(NDEBUG) && defined(_MSC_VER)
                    in_parse_block--;
                    SPRTF("Exit ParseBlock 2 %d...\n",in_parse_block);
#endif
                    return;
                }
            }
        }

        /* mixed content model permits text */
        if (TY_(nodeIsText)(node))
        {
            /* on the first text token only: when InlineDup reports a
               positive count the token is not inserted on this pass */
            if ( checkstack )
            {
                checkstack = no;

                if (!(element->tag->model & CM_MIXED))
                {
                    if ( TY_(InlineDup)(doc, node) > 0 )
                        continue;
                }
            }

            TY_(InsertNodeAtEnd)(element, node);
            mode = MixedContent;

            /*
              HTML4 strict doesn't allow mixed content for
              elements with %block; as their content model
            */
            /*
              But only body, map, blockquote, form and
              noscript have content model %block;
            */
            if ( nodeIsBODY(element)       ||
                 nodeIsMAP(element)        ||
                 nodeIsBLOCKQUOTE(element) ||
                 nodeIsFORM(element)       ||
                 nodeIsNOSCRIPT(element) )
                TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
            continue;
        }

        /* comments, processing instructions and similar nodes */
        if ( InsertMisc(element, node) )
            continue;

        /* allow PARAM elements? */
        if ( nodeIsPARAM(node) )
        {
            if ( TY_(nodeHasCM)(element, CM_PARAM) && TY_(nodeIsElement)(node) )
            {
                TY_(InsertNodeAtEnd)(element, node);
                continue;
            }

            /* otherwise discard it */
            TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, node );
            continue;
        }

        /* allow AREA elements? */
        if ( nodeIsAREA(node) )
        {
            if ( nodeIsMAP(element) && TY_(nodeIsElement)(node) )
            {
                TY_(InsertNodeAtEnd)(element, node);
                continue;
            }

            /* otherwise discard it */
            TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, node );
            continue;
        }

        /* ignore unknown start/end tags */
        if ( node->tag == NULL )
        {
            TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, node );
            continue;
        }

        /*
          Allow CM_INLINE elements here.

          Allow CM_BLOCK elements here unless
          lexer->excludeBlocks is yes.

          LI and DD are special cased.

          Otherwise infer end tag for this element.
        */
        if ( !TY_(nodeHasCM)(node, CM_INLINE) )
        {
            /* a non-inline token that is not an element is discarded */
            if ( !TY_(nodeIsElement)(node) )
            {
                if ( nodeIsFORM(node) )
                    BadForm( doc );

                TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
                TY_(FreeNode)( doc, node );
                continue;
            }

            /* #427671 - Fix by Randy Waki - 10 Aug 00 */
            /*
             If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION
             start tag, discard the start tag and let the subsequent content get
             parsed as content of the enclosing LI.  This seems to mimic IE and
             Netscape, and avoids an infinite loop: without this check,
             ParseBlock (which is parsing the LI's content) and ParseList (which
             is parsing the LI's parent's content) repeatedly defer to each
             other to parse the illegal start tag, each time inferring a missing
             or [NOTE(review): comment truncated in this copy] */
            /* white space before a BR is trimmed from the element */
            if ( nodeIsBR(node) )
                TrimSpaces( doc, element );

            TY_(InsertNodeAtEnd)(element, node);

            if (node->implicit)
                TY_(Report)(doc, element, node, INSERTING_TAG );

            /* Issue #212 - WHY is this hard coded to 'IgnoreWhitespace' while an
               effort has been made above to set a 'MixedContent' mode in some cases?
               WHY IS THE 'mode' VARIABLE NOT USED HERE???? */
            ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
            continue;
        }

        /* discard unexpected tags */
        if (node->type == EndTag)
            TY_(PopInline)( doc, node );  /* if inline end tag */

        TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
        TY_(FreeNode)( doc, node );
        continue;
    }

    /* reached at end of input or after the "break" above: the end tag
       is missing and is reported unless the element's end tag is
       optional (CM_OPT) */
    if (!(element->tag->model & CM_OPT))
        TY_(Report)(doc, element, node, MISSING_ENDTAG_FOR);

    if (element->tag->model & CM_OBJECT)
    {
        /* pop inline stack */
        while ( lexer->istacksize > lexer->istackbase )
            TY_(PopInline)( doc, NULL );
        lexer->istackbase = istackbase;
    }

    TrimSpaces( doc, element );

#if !defined(NDEBUG) && defined(_MSC_VER)
    in_parse_block--;
    SPRTF("Exit ParseBlock 10 %d...\n",in_parse_block);
#endif
}

/* [i_a] svg / math */

/* State shared between a tree traversal and FindDescendant_cb. */
struct MatchingDescendantData
{
    Node *found_node;           /* output: first matching node, if any */
    Bool *passed_marker_node;   /* output (optional): set to yes once marker_node is visited */

    /* input: */
    TidyTagId matching_tagId;   /* tag id to look for */
    Node *node_to_find;         /* node whose element name is compared for unknown tags */
    Node *marker_node;          /* node whose visit is recorded via passed_marker_node */
};

/* Traversal callback; 'propagate' points to a MatchingDescendantData.
   When the visited node's tag id equals matching_tagId (and, for
   TidyTag_UNKNOWN, the element names compare equal) the node is
   stored in found_node and ExitTraversal is returned. Otherwise the
   marker flag is set if this node is the marker node and the flag
   pointer is non-NULL, and VisitParent is returned. */
static NodeTraversalSignal FindDescendant_cb(TidyDocImpl* ARG_UNUSED(doc), Node* node, void *propagate)
{
    struct MatchingDescendantData *cb_data = (struct MatchingDescendantData *)propagate;

    if (TagId(node) == cb_data->matching_tagId)
    {
        /* make sure we match up 'unknown' tags exactly! */
        if (cb_data->matching_tagId != TidyTag_UNKNOWN ||
            (node->element != NULL &&
            cb_data->node_to_find != NULL &&
            cb_data->node_to_find->element != NULL &&
            0 == TY_(tmbstrcmp)(cb_data->node_to_find->element, node->element)))
        {
            cb_data->found_node = node;
            return ExitTraversal;
        }
    }

    if (cb_data->passed_marker_node && node == cb_data->marker_node)
        *cb_data->passed_marker_node = yes;

    return VisitParent;
}

/*
Search the parent chain (from 'parent' upwards up to the root) for a node matching the
given 'node'.

When the search passes beyond the 'marker_node' (which is assumed to sit in the
parent chain), this will be flagged by setting the boolean referenced by
'is_parent_of_marker' to yes.

'is_parent_of_marker' and 'marker_node' are optional parameters and may be NULL.
*/
static Node *FindMatchingDescendant( Node *parent, Node *node, Node *marker_node, Bool *is_parent_of_marker )
{
    /* zero-initialised: found_node stays NULL when nothing matches,
       which is then the return value */
    struct MatchingDescendantData cb_data = { 0 };

    /* validate the argument before anything is derived from it */
    assert(node);

    cb_data.matching_tagId = TagId(node);
    cb_data.node_to_find = node;
    cb_data.marker_node = marker_node;

    /* Hand the caller's flag to the callback. Without this the
       callback sees a NULL flag pointer and can never report that
       the marker node was passed, so '*is_parent_of_marker' would
       always stay 'no'. */
    cb_data.passed_marker_node = is_parent_of_marker;

    if (is_parent_of_marker)
        *is_parent_of_marker = no;

    TY_(TraverseNodeTree)(NULL, parent, FindDescendant_cb, &cb_data);
    return cb_data.found_node;
}

/*
   Act as a generic XML (sub)tree parser: collect each node and add it to the DOM, without any further validation.
   TODO : add schema- or other-hierarchy-definition-based validation of the subtree here...
*/
void TY_(ParseNamespace)(TidyDocImpl* doc, Node *basenode, GetTokenMode mode)
{
    Lexer* lexer = doc->lexer;
    Node *node;
    Node *parent = basenode;
    uint istackbase;
    AttVal* av; /* #130 MathML attr and entity fix! */

    /* a la
or | */ TY_(InsertNodeAtEnd)(row, node); exclude_state = lexer->excludeBlocks; lexer->excludeBlocks = no; ParseTag( doc, node, IgnoreWhitespace); lexer->excludeBlocks = exclude_state; /* pop inline stack */ while ( lexer->istacksize > lexer->istackbase ) TY_(PopInline)( doc, NULL ); } } void TY_(ParseRowGroup)(TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSED(mode)) { Lexer* lexer = doc->lexer; Node *node, *parent; if (rowgroup->tag->model & CM_EMPTY) return; while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) { if (node->tag == rowgroup->tag) { if (node->type == EndTag) { rowgroup->closed = yes; TY_(FreeNode)( doc, node); return; } TY_(UngetToken)( doc ); return; } /* if |
---|