/* parser.c -- HTML Parser
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "tidy-int.h"
#include "lexer.h"
#include "parser.h"
#include "message.h"
#include "clean.h"
#include "tags.h"
#include "tmbstr.h"
#include "sprtf.h"
/****************************************************************************//*
** MARK: - Forward Declarations
***************************************************************************/
static void ParseTag( TidyDocImpl* doc, Node *node, GetTokenMode mode );
/****************************************************************************//*
** MARK: - Configuration Options
***************************************************************************/
/**
 * Issue #72 - Need to know to avoid error-reporting - no warning only if
 * --show-body-only yes.
 * Issue #132 - Likewise avoid warning if showing body only.
 *
 * Evaluates to `yes` when the TidyBodyOnly option of `doc` is exactly
 * TidyYesState, and to `no` otherwise.
 *
 * The argument and the whole expansion are parenthesized so the macro
 * behaves as a single operand wherever it is used; without the outer
 * parentheses the conditional operator would absorb neighbouring operators
 * (for example `!showingBodyOnly(doc)` or `showingBodyOnly(doc) && x`).
 */
#define showingBodyOnly(doc) ((cfgAutoBool((doc), TidyBodyOnly) == TidyYesState) ? yes : no)
/****************************************************************************//*
** MARK: - Node Operations
***************************************************************************/
/**
 * Splices `node` into the tree at the position currently held by `element`,
 * and makes `element` the one and only child of `node`.
 *
 * `element` must already have a parent; its former siblings become the
 * siblings of `node`.
 */
static void InsertNodeAsParent(Node *element, Node *node)
{
    Node *owner  = element->parent;
    Node *before = element->prev;
    Node *after  = element->next;

    /* node takes over element's slot under the old parent */
    node->parent = owner;
    if (owner->content == element)
        owner->content = node;
    if (owner->last == element)
        owner->last = node;

    /* ...and element's place in the sibling chain */
    node->prev = before;
    if (before)
        before->next = node;
    node->next = after;
    if (after)
        after->prev = node;

    /* element is now the sole child of node */
    node->content = element;
    node->last = element;
    element->parent = node;
    element->prev = NULL;
    element->next = NULL;
}
/**
 * Tries to place a "miscellaneous" node (something that is not ordinary
 * element or text content) into the tree, choosing the location from the
 * node's type.
 *
 * Returns `yes` if the node was inserted, `no` if the caller still has to
 * deal with it.
 */
static Bool InsertMisc(Node *element, Node *node)
{
    switch (node->type)
    {
    /* comments, processing instructions and friends go at the end */
    case CommentTag:
    case ProcInsTag:
    case CDATATag:
    case SectionTag:
    case AspTag:
    case JsteTag:
    case PhpTag:
        TY_(InsertNodeAtEnd)(element, node);
        return yes;

    /* an XML declaration goes to the very start of the top-most ancestor,
    ** unless one is already sitting there
    */
    case XmlDecl:
    {
        Node *top = element;

        while ( top && top->parent )
            top = top->parent;

        if ( top && !(top->content && top->content->type == XmlDecl) )
        {
            TY_(InsertNodeAtStart)( top, node );
            return yes;
        }
        break;
    }

    default:
        break;
    }

    /* Declared empty tags seem to be slipping through
    ** the cracks. This is an experiment to figure out
    ** a decent place to pick them up.
    */
    if ( node->tag &&
         TY_(nodeIsElement)(node) &&
         TY_(nodeCMIsEmpty)(node) && TagId(node) == TidyTag_UNKNOWN &&
         (node->tag->versions & VERS_PROPRIETARY) != 0 )
    {
        TY_(InsertNodeAtEnd)(element, node);
        return yes;
    }

    return no;
}
/**
 * Relocates `node` into the document head. `element` is only used as the
 * context node for the reports that are issued. Normally called during
 * parsing.
 *
 * Element nodes are appended to the head (and parsed, if their tag has a
 * parser); anything else is reported as unexpected and freed.
 */
static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node )
{
    Node *head;

    /* make sure that node is isolated */
    TY_(RemoveNode)( node );

    if ( !TY_(nodeIsElement)(node) )
    {
        TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
        TY_(FreeNode)( doc, node );
        return;
    }

    TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN );

    head = TY_(FindHEAD)(doc);
    assert(head != NULL);
    TY_(InsertNodeAtEnd)(head, node);

    if ( node->tag->parser )
        ParseTag( doc, node, IgnoreWhitespace );
}
/**
 * Detaches `node` from wherever it is and appends it to the body element.
 * Does nothing at all when the document has no body.
 */
static void MoveNodeToBody( TidyDocImpl* doc, Node* node )
{
    Node* target = TY_(FindBody)( doc );

    if ( target == NULL )
        return;

    TY_(RemoveNode)( node );
    TY_(InsertNodeAtEnd)( target, node );
}
/**
 * Unexpected content in a table row is moved to just before the enclosing
 * table, in accordance with Netscape and IE. This code assumes that `node`
 * hasn't been inserted into the row.
 *
 * If no ancestor of `row` is a table, the node is placed before the row's
 * parent instead.
 */
static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row,
                             Node *node )
{
    Node *anchor = row->parent;

    /* walk up until we hit the table element (or run out of ancestors) */
    while ( anchor && !nodeIsTABLE(anchor) )
        anchor = anchor->parent;

    /* No table element */
    if ( anchor == NULL )
        anchor = row->parent;

    TY_(InsertNodeBeforeElement)( anchor, node );
}
/**
 * Generalised search for duplicate elements: looks for an element with tag
 * id `tid` among `node`, its following siblings, and all of their
 * descendants.
 * Issue #166 - repeated element.
 *
 * Issue #459 - each node is examined exactly once: siblings are walked
 * iteratively and only the first child of a node is handed to the
 * recursive call (which then covers that child's own siblings). An earlier
 * form that chained through `content->content` could loop for a very long
 * time on large documents.
 */
static Bool findNodeWithId( Node *node, TidyTagId tid )
{
    for ( ; node; node = node->next )
    {
        if ( TagIsId(node, tid) )
            return yes;

        if ( node->content && findNodeWithId(node->content, tid) )
            return yes;
    }
    return no;
}
/**
 * Searches the whole document, starting at the first child of the root,
 * for an element with the given tag id. A NULL `doc` yields `no`.
 * Issue #166 - repeated element
 */
static Bool findNodeById( TidyDocImpl* doc, TidyTagId tid )
{
    if ( !doc )
        return no;

    return findNodeWithId( doc->root.content, tid );
}
/***************************************************************************//*
** MARK: - Decision Making
***************************************************************************/
/**
 * Decides whether `element` may be dropped from the tree because it is
 * empty, taking the user's configuration and a list of elements that must
 * be kept even when empty into account.
 *
 * Returns `yes` when pruning is allowed, `no` otherwise.
 */
static Bool CanPrune( TidyDocImpl* doc, Node *element )
{
    /* the user can switch pruning off altogether */
    if ( !cfgBool(doc, TidyDropEmptyElems) )
        return no;

    if ( TY_(nodeIsText)(element) )
        return yes;

    /* anything that has children, or no tag definition, stays */
    if ( element->content || element->tag == NULL )
        return no;

    /* attributes make block elements and anchors significant */
    if ( (element->tag->model & CM_BLOCK) && element->attributes != NULL )
        return no;
    if ( nodeIsA(element) && element->attributes != NULL )
        return no;

    /* empty paragraphs are governed by their own option */
    if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) )
        return no;

    /* table rows and elements declared empty */
    if ( (element->tag->model & CM_ROW) || (element->tag->model & CM_EMPTY) )
        return no;

    if ( nodeIsAPPLET(element) || nodeIsOBJECT(element) )
        return no;

    if ( nodeIsSCRIPT(element) && attrGetSRC(element) )
        return no;

    /* title; iframe (#433359 - fix by Randy Waki 12 Mar 01);
       textarea (fix for bug 770297) */
    if ( nodeIsTITLE(element) || nodeIsIFRAME(element) || nodeIsTEXTAREA(element) )
        return no;

    /* fix for ISSUE #7 https://github.com/w3c/tidy-html5/issues/7 */
    if ( nodeIsCANVAS(element) || nodeIsPROGRESS(element) )
        return no;

    if ( attrGetID(element) || attrGetNAME(element) )
        return no;

    /* fix for bug 695408; a better fix would look for unknown and */
    /* known proprietary attributes that make the element significant */
    if ( attrGetDATAFLD(element) )
        return no;

    /* fix for bug 723772, don't trim new-...-tags */
    if ( element->tag->id == TidyTag_UNKNOWN )
        return no;

    if ( nodeIsBODY(element) || nodeIsCOLGROUP(element) )
        return no;

    /* HTML5 - do NOT drop empty option if it has attributes */
    if ( nodeIsOPTION(element) && element->attributes != NULL )
        return no;

    /* fix for #103 - don't drop empty dd tags lest document not validate */
    if ( nodeIsDD(element) )
        return no;

    return yes;
}
/**
 * Returns `yes` if any ancestor of `element` has the tag id `tid`.
 * The element itself is not examined.
 */
static Bool DescendantOf( Node *element, TidyTagId tid )
{
    Node *ancestor = element->parent;

    while ( ancestor != NULL )
    {
        if ( TagIsId(ancestor, tid) )
            return yes;
        ancestor = ancestor->parent;
    }
    return no;
}
/**
 * Returns `yes` if any ancestor of `node` is an element whose tag is
 * handled by the pre parser, TY_(ParsePre).
 */
static Bool IsPreDescendant(Node* node)
{
    Node *ancestor;

    for ( ancestor = node->parent; ancestor; ancestor = ancestor->parent )
    {
        if ( ancestor->tag && ancestor->tag->parser == TY_(ParsePre) )
            return yes;
    }
    return no;
}
/**
 * Indicates whether or not trailing whitespace of the text node `node`
 * should be cleaned. Non-text nodes, text directly inside a doctype, text
 * below a pre element and text inside a script-parsed element are never
 * cleaned.
 */
static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node)
{
    Node* parent;
    Node* next;

    if (!TY_(nodeIsText)(node))
        return no;

    parent = node->parent;

    if (parent->type == DocTypeTag)
        return no;

    if (IsPreDescendant(node))
        return no;

    if (parent->tag && parent->tag->parser == TY_(ParseScript))
        return no;

    next = node->next;

    if (next == NULL)
    {
        /* last child of a parent that is not inline */
        if (!TY_(nodeHasCM)(parent, CM_INLINE))
            return yes;

        /* last child of a parent that is directly followed by
           something that is not inline */
        if (parent->next && !TY_(nodeHasCM)(parent->next, CM_INLINE))
            return yes;

        return no;
    }

    /* text directly followed by a line break */
    if (nodeIsBR(next))
        return yes;

    /* text followed by inline content keeps its trailing space */
    if (TY_(nodeHasCM)(next, CM_INLINE))
        return no;

    /* text followed by a non-inline start tag or start-end tag */
    if (next->type == StartTag || next->type == StartEndTag)
        return yes;

    /* evil adjacent text nodes, Tidy should not generate these :-( */
    if (TY_(nodeIsText)(next) && next->start < next->end
        && TY_(IsWhite)(doc->lexer->lexbuf[next->start]))
        return yes;

    return no;
}
/**
 * Indicates whether or not leading whitespace of the text node `node`
 * should be cleaned. Non-text nodes, text directly inside a doctype, text
 * below a pre element and text inside a script-parsed element are never
 * cleaned.
 */
static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node)
{
    Node* parent;
    Node* prev;

    if (!TY_(nodeIsText)(node))
        return no;

    parent = node->parent;

    if (parent->type == DocTypeTag)
        return no;

    if (IsPreDescendant(node))
        return no;

    if (parent->tag && parent->tag->parser == TY_(ParseScript))
        return no;

    prev = node->prev;

    /* text directly after a line break */
    if (nodeIsBR(prev))
        return yes;

    if (prev != NULL)
    {
        /* text directly after an element that is not inline */
        if (!TY_(nodeHasCM)(prev, CM_INLINE) && TY_(nodeIsElement)(prev))
            return yes;
        return no;
    }

    /* from here on the text is the first child of its parent */

    /* ...of a parent that is not inline */
    if (!TY_(nodeHasCM)(parent, CM_INLINE))
        return yes;

    /* ...of a parent that is itself a first child, where the
       grandparent is not inline */
    if (!parent->prev && !TY_(nodeHasCM)(parent->parent, CM_INLINE))
        return yes;

    return no;
}
/**
 * Indicates whether or not `node` is acceptable as content of a pre
 * element: paragraphs and text always are; otherwise the node needs a tag
 * definition, must not be a param, and must have CM_INLINE or CM_NEW in
 * its content model.
 */
static Bool PreContent( TidyDocImpl* ARG_UNUSED(doc), Node* node )
{
    /* p is coerced to br's, Text OK too */
    if ( nodeIsP(node) || TY_(nodeIsText)(node) )
        return yes;

    if ( node->tag == NULL )
        return no;

    if ( nodeIsPARAM(node) )
        return no;

    return TY_(nodeHasCM)(node, CM_INLINE|CM_NEW) ? yes : no;
}
/**
 * Returns `yes` when the node's content model includes CM_INLINE but does
 * not include CM_BLOCK.
 */
static Bool nodeCMIsOnlyInline( Node* node )
{
    if ( !TY_(nodeHasCM)( node, CM_INLINE ) )
        return no;

    return TY_(nodeHasCM)( node, CM_BLOCK ) ? no : yes;
}
/***************************************************************************//*
** MARK: - Information Accumulation
***************************************************************************/
/**
 * Records on the document that a form problem was seen by setting the
 * flg_BadForm bit in `doc->badForm`.
 *
 * Errors in positioning of form start or end tags generally require human
 * intervention to fix. Issue #166 - repeated element also uses this flag
 * to indicate duplicates, discarded.
 */
static void BadForm( TidyDocImpl* doc )
{
    doc->badForm = doc->badForm | flg_BadForm;
}
/***************************************************************************//*
** MARK: - Fixes and Touchup
***************************************************************************/
/**
 * Trims one trailing space character from the last text child of an
 * element.
 *
 * If `last` is a non-empty text node ending in a space, that space is
 * removed from the node. When `element` is inline (CM_INLINE) and not a
 * form field (CM_FIELD), the lexer's `insertspace` flag is raised so the
 * space is not lost but re-emitted after the element's end tag.
 */
static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last )
{
    Lexer* lexer = doc->lexer;

    if ( !TY_(nodeIsText)(last) )
        return;

    /* nothing to trim from an empty node */
    if ( last->end <= last->start )
        return;

    if ( (byte) lexer->lexbuf[ last->end - 1 ] != ' ' )
        return;

    last->end -= 1;

    if ( (element->tag->model & CM_INLINE) &&
         !(element->tag->model & CM_FIELD) )
        lexer->insertspace = yes;
}
/**
 * Trims one leading space character from the first text child of an
 * element.
 *
 * If `text` is a non-empty text node starting with a space, that space is
 * dropped from the node. When `element` is inline (CM_INLINE) and not a
 * form field (CM_FIELD), the space is first carried over to in front of
 * the element: if the element's previous sibling is a text node, a space
 * is appended to it unless it already ends in one; otherwise a new
 * one-character node holding a space is inserted before `element`.
 *
 * @param doc     Document being parsed; supplies the lexer and its buffer.
 * @param element Element whose leading space is being moved out.
 * @param text    First child of `element`; ignored unless it is a text node.
 */
static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text )
{
    Lexer* lexer = doc->lexer;
    Node *prev, *node;

    /* Make sure the node is non-empty BEFORE looking at its first
       character, so an empty text node never causes a read outside
       the node's own span of the lexer buffer. */
    if ( TY_(nodeIsText)(text) &&
         text->start < text->end &&
         lexer->lexbuf[text->start] == ' ' )
    {
        if ( (element->tag->model & CM_INLINE) &&
             !(element->tag->model & CM_FIELD) )
        {
            prev = element->prev;

            if (TY_(nodeIsText)(prev))
            {
                /* extend the preceding text by one space, unless it
                   already ends with one */
                if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ')
                    lexer->lexbuf[(prev->end)++] = ' ';

                ++(element->start);
            }
            else /* create new node */
            {
                node = TY_(NewNode)(lexer->allocator, lexer);
                node->start = (element->start)++;
                node->end = element->start;
                lexer->lexbuf[node->start] = ' ';
                TY_(InsertNodeBeforeElement)(element ,node);
                DEBUG_LOG(SPRTF("TrimInitialSpace: Created text node, inserted before <%s>\n",
                    (element->element ? element->element : "unknown")));
            }
        }

        /* discard the space in current node */
        ++(text->start);
    }
}
/**
 * Walks `node`, its following siblings and all of their descendants,
 * stripping leading and/or trailing whitespace from text nodes where
 * CleanLeadingWhitespace() / CleanTrailingWhitespace() allow it. Text
 * nodes that end up empty are removed from the tree and freed.
 *
 * The walk is iterative: when descending into a node's children, the
 * sibling to resume with is kept on an explicit stack.
 */
static void CleanSpaces(TidyDocImpl* doc, Node* node)
{
    Stack *pending = TY_(newStack)(doc, 16);
    Node *sibling;

    while (node)
    {
        /* remember where to go next; node itself may be freed below */
        sibling = node->next;

        if (TY_(nodeIsText)(node))
        {
            if (CleanLeadingWhitespace(doc, node))
            {
                while (node->start < node->end
                       && TY_(IsWhite)(doc->lexer->lexbuf[node->start]))
                    ++(node->start);
            }

            if (CleanTrailingWhitespace(doc, node))
            {
                while (node->end > node->start
                       && TY_(IsWhite)(doc->lexer->lexbuf[node->end - 1]))
                    --(node->end);
            }

            /* nothing left of the text: drop the node entirely */
            if (!(node->start < node->end))
            {
                TY_(RemoveNode)(node);
                TY_(FreeNode)(doc, node);
                node = sibling ? sibling : TY_(pop)(stack_alias_guard(pending));
                continue;
            }
        }

        if (node->content)
        {
            /* go down one level, resume with the sibling afterwards */
            TY_(push)(pending, sibling);
            node = node->content;
            continue;
        }

        node = sibling ? sibling : TY_(pop)(pending);
    }

    TY_(freeStack)(pending);
}
/**
 * Moves an initial and a trailing space out of `element`: a leading space
 * in its first text child is handled by TrimInitialSpace(), a trailing
 * space in its last text child by TrimTrailingSpace().
 *
 * Nothing is done for pre elements or for anything below a pre element.
 */
static void TrimSpaces( TidyDocImpl* doc, Node *element)
{
    Node* child;

    if ( nodeIsPRE(element) || IsPreDescendant(element) )
        return;

    child = element->content;
    if ( TY_(nodeIsText)(child) )
        TrimInitialSpace(doc, element, child);

    child = element->last;
    if ( TY_(nodeIsText)(child) )
        TrimTrailingSpace(doc, element, child);
}
/**
 * Handles a doctype that was found after other tags.
 *
 * If the document already has a doctype, the new one is reported as
 * unexpected and freed. Otherwise it is reported as DOCTYPE_AFTER_TAGS
 * and inserted before the html element, which is located by walking up
 * from `element`.
 */
static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype )
{
    if ( TY_(FindDocType)( doc ) )
    {
        TY_(Report)(doc, element, doctype, DISCARDING_UNEXPECTED );
        TY_(FreeNode)( doc, doctype );
        return;
    }

    TY_(Report)(doc, element, doctype, DOCTYPE_AFTER_TAGS );

    /* climb to the html element */
    for ( ; !nodeIsHTML(element); element = element->parent )
        ;

    TY_(InsertNodeBeforeElement)( element, doctype );
}
/**
 * Attaches style information to `node` that keeps an inferred UL from
 * being indented. Only active when TidyDecorateInferredUL is set; the
 * style is added as a class when TidyMakeClean is set, and as a style
 * property of the node otherwise.
 */
static void AddClassNoIndent( TidyDocImpl* doc, Node *node )
{
    ctmbstr noIndentStyle =
        "padding-left: 2ex; margin-left: 0ex"
        "; margin-top: 0ex; margin-bottom: 0ex";

    if ( cfgBool(doc, TidyDecorateInferredUL) )
    {
        if ( cfgBool(doc, TidyMakeClean) )
            TY_(AddStyleAsClass)( doc, node, noIndentStyle );
        else
            TY_(AddStyleProperty)( doc, node, noIndentStyle );
    }
}
/**
 * Gives an empty table row an inferred TD cell and reports the missing
 * start tag. This practice is consistent with browser behavior and avoids
 * potential problems with row spanning cells. Rows that already have
 * content are left alone.
 */
static void FixEmptyRow(TidyDocImpl* doc, Node *row)
{
    Node *inferred;

    if (row->content != NULL)
        return;

    inferred = TY_(InferredTag)(doc, TidyTag_TD);
    TY_(InsertNodeAtEnd)(row, inferred);
    TY_(Report)(doc, row, inferred, MISSING_STARTTAG);
}
/***************************************************************************//*
** MARK: - Parsers Support
***************************************************************************/
/**
 * Structure used by FindDescendant_cb. It is handed to the callback as its
 * opaque `propagate` argument and carries both the search criteria and the
 * search results.
 */
struct MatchingDescendantData
{
/* output: the node that matched; written by the callback on a match */
Node *found_node;
/* output (optional): when non-NULL, the callback stores `yes` through
   this pointer once it visits `marker_node` without having matched */
Bool *passed_marker_node;
/* input: */
/* tag id a visited node must have in order to match */
TidyTagId matching_tagId;
/* node being searched for; its element name is compared as well when
   `matching_tagId` is TidyTag_UNKNOWN */
Node *node_to_find;
/* node whose visit is reported through `passed_marker_node` */
Node *marker_node;
};
/**
 * The main engine for FindMatchingDescendant: a tree-traversal callback
 * that checks one node against the criteria in the MatchingDescendantData
 * passed via `propagate`.
 *
 * On a match the node is stored in `found_node` and the traversal is
 * stopped. Otherwise, if the node is the marker node and a flag pointer
 * was supplied, the flag is set to `yes`, and the traversal is told to
 * continue with the parent.
 */
static NodeTraversalSignal FindDescendant_cb(TidyDocImpl* ARG_UNUSED(doc), Node* node, void *propagate)
{
    struct MatchingDescendantData *search = (struct MatchingDescendantData *)propagate;
    Bool matched = no;

    if (TagId(node) == search->matching_tagId)
    {
        if (search->matching_tagId != TidyTag_UNKNOWN)
        {
            matched = yes;
        }
        /* make sure we match up 'unknown' tags exactly! */
        else if (node->element != NULL &&
                 search->node_to_find != NULL &&
                 search->node_to_find->element != NULL &&
                 0 == TY_(tmbstrcmp)(search->node_to_find->element, node->element))
        {
            matched = yes;
        }
    }

    if (matched)
    {
        search->found_node = node;
        return ExitTraversal;
    }

    if (search->passed_marker_node && node == search->marker_node)
        *search->passed_marker_node = yes;

    return VisitParent;
}
/**
 * Search the parent chain (from `parent` upwards up to the root) for a node
 * matching the given 'node'. A node matches when it has the same tag id;
 * for unknown tags the element names must be equal as well.
 *
 * When the search passes beyond the `marker_node` (which is assumed to sit
 * in the parent chain), this will be flagged by setting the boolean
 * referenced by `is_parent_of_marker` to `yes`. The boolean is reset to
 * `no` before the search starts.
 *
 * 'is_parent_of_marker' and 'marker_node' are optional parameters and may
 * be NULL.
 *
 * @param parent              Node at which the search starts.
 * @param node                Node to find a match for; must not be NULL.
 * @param marker_node         Optional node whose passing is to be flagged.
 * @param is_parent_of_marker Optional out-flag, see above.
 * @return The matching node, or NULL if the search found none.
 */
static Node *FindMatchingDescendant( Node *parent, Node *node, Node *marker_node, Bool *is_parent_of_marker )
{
    struct MatchingDescendantData cb_data = { 0 };

    /* validate the argument before anything is derived from it */
    assert(node);

    cb_data.matching_tagId = TagId(node);
    cb_data.node_to_find = node;
    cb_data.marker_node = marker_node;

    /* Hand the caller's flag to the callback. Without this the callback
       never sees a flag pointer and the marker can never be reported. */
    cb_data.passed_marker_node = is_parent_of_marker;

    if (is_parent_of_marker)
        *is_parent_of_marker = no;

    TY_(TraverseNodeTree)(NULL, parent, FindDescendant_cb, &cb_data);
    return cb_data.found_node;
}
/**
 * Finds the last direct child of `list` that is an LI start tag and
 * stores it in the out parameter `lastli` (NULL if there is none).
 * Returns `yes` if such a list item was found, `no` otherwise.
 */
static Bool FindLastLI( Node *list, Node **lastli )
{
    Node *child = list->content;
    Node *candidate = NULL;

    while ( child )
    {
        if ( nodeIsLI(child) && child->type == StartTag )
            candidate = child;
        child = child->next;
    }

    *lastli = candidate;
    return candidate ? yes : no;
}
/***************************************************************************//*
** MARK: - Parser Stack
***************************************************************************/
/**
 * Allocates the parser's stack with room for 16 entries, using the
 * document's allocator, and marks it as empty (top == -1).
 */
void TY_(InitParserStack)( TidyDocImpl* doc )
{
    const uint initial_capacity = 16;

    doc->stack.allocator = doc->allocator;
    doc->stack.size = initial_capacity;
    doc->stack.top = -1;
    doc->stack.content = (TidyParserMemory *) TidyAlloc( doc->allocator, sizeof(TidyParserMemory) * initial_capacity );
}
/**
 * Releases the parser stack's storage and resets the stack to an empty,
 * zero-capacity state.
 */
void TY_(FreeParserStack)( TidyDocImpl* doc )
{
    TidyFree( doc->stack.allocator, doc->stack.content );

    /* leave no dangling pointer or stale bookkeeping behind */
    doc->stack.top = -1;
    doc->stack.size = 0;
    doc->stack.content = NULL;
}
/**
 * Doubles the capacity of the parser stack: a buffer of twice the size is
 * allocated, the entries currently in use are copied over, and the old
 * buffer is freed.
 * TODO: guard against overflow of the size computation, and report an
 * error when the stack cannot grow any further.
 */
static void growParserStack( TidyDocImpl* doc )
{
    TidyParserMemory *previous = doc->stack.content;
    TidyParserMemory *bigger =
        (TidyParserMemory *) TidyAlloc( doc->stack.allocator, sizeof(TidyParserMemory) * doc->stack.size * 2 );

    /* only slots 0..top hold live entries */
    memcpy( bigger, previous, sizeof(TidyParserMemory) * (doc->stack.top + 1) );
    TidyFree( doc->stack.allocator, previous );

    doc->stack.content = bigger;
    doc->stack.size = doc->stack.size * 2;
}
/**
 * Returns `yes` when the parser stack holds no entries, i.e. when its
 * top index is negative.
 */
static Bool isEmptyParserStack( TidyDocImpl* doc )
{
    return ( doc->stack.top >= 0 ) ? no : yes;
}
/**
 * Pushes a copy of `data` onto the parser stack, growing the stack first
 * when it is full.
 */
static void pushMemory( TidyDocImpl* doc, TidyParserMemory data )
{
    /* top already at the last slot: make room */
    if ( doc->stack.top == doc->stack.size - 1 )
        growParserStack( doc );

    doc->stack.content[++doc->stack.top] = data;
}
/**
 * Returns a copy of the entry on top of the parser stack without removing
 * it. No check for an empty stack is made here.
 */
static FUNC_UNUSED TidyParserMemory peekMemory( TidyDocImpl* doc )
{
    return *( doc->stack.content + doc->stack.top );
}
/**
 * Returns the "mode" field of the entry on top of the parser stack. This
 * is just a convenience that spares the caller a full struct copy. No
 * check for an empty stack is made here.
 */
static GetTokenMode peekMemoryMode( TidyDocImpl* doc )
{
    return ( doc->stack.content + doc->stack.top )->mode;
}
/**
 * Returns the "identity" field of the entry on top of the parser stack.
 * This is just a convenience that spares the caller a full struct copy.
 * No check for an empty stack is made here.
 */
static Parser* peekMemoryIdentity( TidyDocImpl* doc )
{
    return ( doc->stack.content + doc->stack.top )->identity;
}
/**
 * Removes the entry on top of the parser stack and returns it. When the
 * stack is empty, a blank (zero-initialized) entry is returned and the
 * stack is left untouched.
 */
static TidyParserMemory popMemory( TidyDocImpl* doc )
{
    TidyParserMemory result = { NULL };

    if ( !isEmptyParserStack( doc ) )
    {
        result = doc->stack.content[doc->stack.top];
        doc->stack.top -= 1;
    }

    return result;
}
/***************************************************************************//*
** MARK: - Parser Search and Instantiation
***************************************************************************/
/**
 * Looks up the parser responsible for `node` and prepares the lexer for
 * parsing it.
 *
 * Returns NULL when the node has no tag definition, when its tag has no
 * parser, or when the node is a start-end tag. Otherwise the lexer's
 * `parent` is pointed at the node and the tag's parser is returned.
 */
static Parser* GetParserForNode( TidyDocImpl* doc, Node *node )
{
    Lexer* lexer = doc->lexer;

    /* [i_a]2 prevent crash for active content (php, asp) docs */
    if ( node->tag == NULL )
        return NULL;

    /*
       Fix by GLP 2000-12-21.  Need to reset insertspace if this is both
       a non-inline and empty tag (base, link, meta, isindex, hr, area).
    */
    if ( node->tag->model & CM_EMPTY )
        lexer->waswhite = no;
    else if ( !(node->tag->model & CM_INLINE) )
        lexer->insertspace = no;

    /* nothing to parse with, or nothing to parse into */
    if ( node->tag->parser == NULL || node->type == StartEndTag )
        return NULL;

    /* [i_a]2 added this - not sure why - CHECKME: */
    lexer->parent = node;

    return node->tag->parser;
}
/**
 * Runs the parser that GetParserForNode() selects for `node`, if there is
 * one. This is currently maintained ONLY until the legacy parsers have
 * been ported, as this introduces recursion when used.
 */
static void ParseTag( TidyDocImpl* doc, Node *node, GetTokenMode mode )
{
    Parser* handler = GetParserForNode( doc, node );

    if ( handler == NULL )
        return;

    handler( doc, node, mode, no );
}
/**
 * The main parser body will populate the document's document root starting
 * with the provided node, which generally should be the HTML node after the
 * pre-HTML stuff is handled at a higher level.
 *
 * This parser works cooperatively with compliant parsers to pass state
 * information back and forth in the TidyDocImpl's `stack`, which resides on
 * the heap and prevents recursion and stack exhaustion, and also works well
 * with the old-style parsers that do recurse.
 *
 * (The goal is to update the old-style parsers slowly and deliberately
 * without causing regressions, in a series of smaller commits and updates.)
 *
 * Each pass of the loop does one of three things, in this order:
 *  1. runs the parser for the current node, if it has one;
 *  2. otherwise, if the parser stack is not empty, re-enters the parser
 *     recorded on top of the stack (or pops an entry that records none);
 *  3. otherwise draws the next token from the lexer.
 * The loop ends when the current node becomes NULL.
 */
void ParseHTMLWithNode( TidyDocImpl* doc, Node* node )
{
GetTokenMode mode = IgnoreWhitespace;
Parser* parser = NULL;
/*
This main loop is only extinguished when all of the parser tokens are
consumed. Note that most of the parsers consume tokens as well, and
so what we're really doing here is managing parsers and preventing
recursion with cooperating parsers.
*/
while ( node )
{
/* Step 1: let the node's own parser (if any) work on it. */
if ( (parser = GetParserForNode( doc, node )) )
{
if ( (node = parser( doc, node, mode, no )) )
{
/*
When a parser returns a node, it means that we have
to continue the loop rather than moving on, because it
indicates that the parser encountered a token it does not
handle. It also tells us the correct GetTokenMode to use
for it via the struct that it pushed:
*/
mode = peekMemoryMode( doc );
continue;
}
}
/*
If we've come this far, the parser has bottomed out, and won't be
going any deeper. Now we run back up the stack to close all of the
open elements and handle any parser post-processing that was needed.
Of course, other nodes might cause us to deepen the stack again, too.
*/
if ( !isEmptyParserStack( doc ) )
{
if ( (parser = peekMemoryIdentity( doc )) )
{
/*
Step 2: re-enter the parser recorded on top of the stack. It is
called with no node and with its last argument set to `yes`.
*/
if ( (node = parser( doc, NULL, 0, yes )) )
{
/* Another assignment from the parser. */
mode = peekMemoryMode( doc );
continue;
}
} else {
/*
There's no identity in the stack (it was used to pass back
a GetToken mode, and nothing else), so discard it.
*/
popMemory( doc );
}
}
/*
Step 3: assuming we've gotten this far, there's no more work to do
and so we can draw a nice, fresh token from the lexer.
*/
node = TY_(GetToken)( doc, mode );
}
}
/***************************************************************************//*
** MARK: - Old Parsers
***************************************************************************/
/** MARK: TY_(oldParseBlock)
* `element` is a node created by the lexer upon seeing the start tag, or
* by the parser when the start tag is inferred
*/
void* TY_(oldParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
{
#if defined(ENABLE_DEBUG_LOG)
static int in_parse_block = 0;
static int parse_block_cnt = 0;
#endif
Lexer* lexer = doc->lexer;
Node *node;
Bool checkstack = yes;
uint istackbase = 0;
#if defined(ENABLE_DEBUG_LOG)
in_parse_block++;
parse_block_cnt++;
SPRTF("Entering ParseBlock %d... %d %s\n",in_parse_block,parse_block_cnt,
((element && element->element) ? element->element : ""));
#endif
if ( element->tag->model & CM_EMPTY ) {
#if defined(ENABLE_DEBUG_LOG)
in_parse_block--;
SPRTF("Exit ParseBlockL 1 %d...\n",in_parse_block);
#endif
return NULL;
}
if ( nodeIsFORM(element) &&
DescendantOf(element, TidyTag_FORM) )
TY_(Report)(doc, element, NULL, ILLEGAL_NESTING );
/*
InlineDup() asks the lexer to insert inline emphasis tags
currently pushed on the istack, but take care to avoid
propagating inline emphasis inside OBJECT or APPLET.
For these elements a fresh inline stack context is created
and disposed of upon reaching the end of the element.
They thus behave like table cells in this respect.
*/
if (element->tag->model & CM_OBJECT)
{
istackbase = lexer->istackbase;
lexer->istackbase = lexer->istacksize;
}
if (!(element->tag->model & CM_MIXED))
TY_(InlineDup)( doc, NULL );
/*\
* Issue #212 - If it is likely that it may be necessary
* to move a leading space into a text node before this
* element, then keep the mode MixedContent to keep any
* leading space
\*/
if ( !(element->tag->model & CM_INLINE) ||
(element->tag->model & CM_FIELD ) )
{
mode = IgnoreWhitespace;
}
else if (mode == IgnoreWhitespace)
{
/* Issue #212 - Further fix in case ParseBlock() is called with 'IgnoreWhitespace'
when such a leading space may need to be inserted before this element to
preverve the browser view */
mode = MixedContent;
}
while ((node = TY_(GetToken)(doc, mode /*MixedContent*/)) != NULL)
{
/* end tag for this element */
if (node->type == EndTag && node->tag &&
(node->tag == element->tag || element->was == node->tag))
{
TY_(FreeNode)( doc, node );
if (element->tag->model & CM_OBJECT)
{
/* pop inline stack */
while (lexer->istacksize > lexer->istackbase)
TY_(PopInline)( doc, NULL );
lexer->istackbase = istackbase;
}
element->closed = yes;
TrimSpaces( doc, element );
#if defined(ENABLE_DEBUG_LOG)
in_parse_block--;
SPRTF("Exit ParseBlock 2 %d...\n",in_parse_block);
#endif
return NULL;
}
if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) )
{
if ( TY_(nodeIsElement)(node) )
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
TY_(FreeNode)( doc, node );
continue;
}
if (node->type == EndTag)
{
if (node->tag == NULL)
{
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
TY_(FreeNode)( doc, node );
continue;
}
else if ( nodeIsBR(node) )
node->type = StartTag;
else if ( nodeIsP(node) )
{
/* Cannot have a block inside a paragraph, so no checking
for an ancestor is necessary -- but we _can_ have
paragraphs inside a block, so change it to an implicit
empty paragraph, to be dealt with according to the user's
options
*/
node->type = StartEndTag;
node->implicit = yes;
}
else if (DescendantOf( element, node->tag->id ))
{
/*
if this is the end tag for an ancestor element
then infer end tag for this element
*/
TY_(UngetToken)( doc );
break;
}
else
{
/* special case etc. for stuff moved in front of table */
if ( lexer->exiled
&& (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
{
TY_(UngetToken)( doc );
TrimSpaces( doc, element );
#if defined(ENABLE_DEBUG_LOG)
in_parse_block--;
SPRTF("Exit ParseBlock 2 %d...\n",in_parse_block);
#endif
return NULL;
}
}
}
/* mixed content model permits text */
if (TY_(nodeIsText)(node))
{
if ( checkstack )
{
checkstack = no;
if (!(element->tag->model & CM_MIXED))
{
if ( TY_(InlineDup)(doc, node) > 0 )
continue;
}
}
TY_(InsertNodeAtEnd)(element, node);
mode = MixedContent;
/*
HTML4 strict doesn't allow mixed content for
elements with %block; as their content model
*/
/*
But only body, map, blockquote, form and
noscript have content model %block;
*/
if ( nodeIsBODY(element) ||
nodeIsMAP(element) ||
nodeIsBLOCKQUOTE(element) ||
nodeIsFORM(element) ||
nodeIsNOSCRIPT(element) )
TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
continue;
}
if ( InsertMisc(element, node) )
continue;
/* allow PARAM elements? */
if ( nodeIsPARAM(node) )
{
if ( TY_(nodeHasCM)(element, CM_PARAM) && TY_(nodeIsElement)(node) )
{
TY_(InsertNodeAtEnd)(element, node);
continue;
}
/* otherwise discard it */
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
TY_(FreeNode)( doc, node );
continue;
}
/* allow AREA elements? */
if ( nodeIsAREA(node) )
{
if ( nodeIsMAP(element) && TY_(nodeIsElement)(node) )
{
TY_(InsertNodeAtEnd)(element, node);
continue;
}
/* otherwise discard it */
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
TY_(FreeNode)( doc, node );
continue;
}
/* ignore unknown start/end tags */
if ( node->tag == NULL )
{
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
TY_(FreeNode)( doc, node );
continue;
}
/*
Allow CM_INLINE elements here.
Allow CM_BLOCK elements here unless
lexer->excludeBlocks is yes.
LI and DD are special cased.
Otherwise infer end tag for this element.
*/
if ( !TY_(nodeHasCM)(node, CM_INLINE) )
{
if ( !TY_(nodeIsElement)(node) )
{
if ( nodeIsFORM(node) )
BadForm( doc );
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
TY_(FreeNode)( doc, node );
continue;
}
/* #427671 - Fix by Randy Waki - 10 Aug 00 */
/*
If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION
start tag, discard the start tag and let the subsequent content get
parsed as content of the enclosing LI. This seems to mimic IE and
Netscape, and avoids an infinite loop: without this check,
ParseBlock (which is parsing the LI's content) and ParseList (which
is parsing the LI's parent's content) repeatedly defer to each
other to parse the illegal start tag, each time inferring a missing
or
respectively.
NOTE: This check is a bit fragile. It specifically checks for the
four tags that happen to weave their way through the current series
of tests performed by ParseBlock and ParseList to trigger the
infinite loop.
*/
if ( nodeIsLI(element) )
{
if ( nodeIsFRAME(node) ||
nodeIsFRAMESET(node) ||
nodeIsOPTGROUP(node) ||
nodeIsOPTION(node) )
{
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
TY_(FreeNode)( doc, node ); /* DSR - 27Apr02 avoid memory leak */
continue;
}
}
if ( nodeIsTD(element) || nodeIsTH(element) )
{
/* if parent is a table cell, avoid inferring the end of the cell */
if ( TY_(nodeHasCM)(node, CM_HEAD) )
{
MoveToHead( doc, element, node );
continue;
}
if ( TY_(nodeHasCM)(node, CM_LIST) )
{
TY_(UngetToken)( doc );
node = TY_(InferredTag)(doc, TidyTag_UL);
AddClassNoIndent(doc, node);
lexer->excludeBlocks = yes;
}
else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
{
TY_(UngetToken)( doc );
node = TY_(InferredTag)(doc, TidyTag_DL);
lexer->excludeBlocks = yes;
}
/* infer end of current table cell */
if ( !TY_(nodeHasCM)(node, CM_BLOCK) )
{
TY_(UngetToken)( doc );
TrimSpaces( doc, element );
#if defined(ENABLE_DEBUG_LOG)
in_parse_block--;
SPRTF("Exit ParseBlock 3 %d...\n",in_parse_block);
#endif
return NULL;
}
}
else if ( TY_(nodeHasCM)(node, CM_BLOCK) )
{
if ( lexer->excludeBlocks )
{
if ( !TY_(nodeHasCM)(element, CM_OPT) )
TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE );
TY_(UngetToken)( doc );
if ( TY_(nodeHasCM)(element, CM_OBJECT) )
lexer->istackbase = istackbase;
TrimSpaces( doc, element );
#if defined(ENABLE_DEBUG_LOG)
in_parse_block--;
SPRTF("Exit ParseBlock 4 %d...\n",in_parse_block);
#endif
return NULL;
}
}
else /* things like list items */
{
if (node->tag->model & CM_HEAD)
{
MoveToHead( doc, element, node );
continue;
}
/*
special case where a form start tag
occurs in a tr and is followed by td or th
*/
if ( nodeIsFORM(element) &&
nodeIsTD(element->parent) &&
element->parent->implicit )
{
if ( nodeIsTD(node) )
{
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
TY_(FreeNode)( doc, node );
continue;
}
if ( nodeIsTH(node) )
{
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
TY_(FreeNode)( doc, node );
node = element->parent;
TidyDocFree(doc, node->element);
node->element = TY_(tmbstrdup)(doc->allocator, "th");
node->tag = TY_(LookupTagDef)( TidyTag_TH );
continue;
}
}
if ( !TY_(nodeHasCM)(element, CM_OPT) && !element->implicit )
TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE );
/* #521, warn on missing optional end-tags if not omitting them. */
if ( cfgBool( doc, TidyOmitOptionalTags ) == no && TY_(nodeHasCM)(element, CM_OPT) )
TY_(Report)(doc, element, node, MISSING_ENDTAG_OPTIONAL );
TY_(UngetToken)( doc );
if ( TY_(nodeHasCM)(node, CM_LIST) )
{
if ( element->parent && element->parent->tag &&
element->parent->tag->parser == TY_(ParseList) )
{
TrimSpaces( doc, element );
#if defined(ENABLE_DEBUG_LOG)
in_parse_block--;
SPRTF("Exit ParseBlock 5 %d...\n",in_parse_block);
#endif
return NULL;
}
node = TY_(InferredTag)(doc, TidyTag_UL);
AddClassNoIndent(doc, node);
}
else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
{
if ( nodeIsDL(element->parent) )
{
TrimSpaces( doc, element );
#if defined(ENABLE_DEBUG_LOG)
in_parse_block--;
SPRTF("Exit ParseBlock 6 %d...\n",in_parse_block);
#endif
return NULL;
}
node = TY_(InferredTag)(doc, TidyTag_DL);
}
else if ( TY_(nodeHasCM)(node, CM_TABLE) || TY_(nodeHasCM)(node, CM_ROW) )
{
/* http://tidy.sf.net/issue/1316307 */
/* In exiled mode, return so table processing can
continue. */
if (lexer->exiled) {
#if defined(ENABLE_DEBUG_LOG)
in_parse_block--;
SPRTF("Exit ParseBlock 7 %d...\n",in_parse_block);
#endif
return NULL;
}
node = TY_(InferredTag)(doc, TidyTag_TABLE);
}
else if ( TY_(nodeHasCM)(element, CM_OBJECT) )
{
/* pop inline stack */
while ( lexer->istacksize > lexer->istackbase )
TY_(PopInline)( doc, NULL );
lexer->istackbase = istackbase;
TrimSpaces( doc, element );
#if defined(ENABLE_DEBUG_LOG)
in_parse_block--;
SPRTF("Exit ParseBlock 8 %d...\n",in_parse_block);
#endif
return NULL;
}
else
{
TrimSpaces( doc, element );
#if defined(ENABLE_DEBUG_LOG)
in_parse_block--;
SPRTF("Exit ParseBlock 9 %d...\n",in_parse_block);
#endif
return NULL;
}
}
}
/*\
* Issue #307 - an tag to ends any open element
* Like #427827 - fixed by Randy Waki and Bjoern Hoehrmann 23 Aug 00
* in ParseInline(), fix copied HERE to ParseBlock()
* href: http://www.w3.org/TR/html-markup/a.html
* The interactive element a must not appear as a descendant of the a element.
\*/
if ( nodeIsA(node) && !node->implicit &&
(nodeIsA(element) || DescendantOf(element, TidyTag_A)) )
{
if (node->type != EndTag && node->attributes == NULL
&& cfgBool(doc, TidyCoerceEndTags) )
{
node->type = EndTag;
TY_(Report)(doc, element, node, COERCE_TO_ENDTAG);
TY_(UngetToken)( doc );
continue;
}
if (nodeIsA(element))
{
TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE);
TY_(UngetToken)( doc );
}
else
{
/* Issue #597 - if we not 'UngetToken' then it is being discarded.
Add message, and 'FreeNode' - thanks @ralfjunker */
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)(doc, node);
}
if (!(mode & Preformatted))
TrimSpaces(doc, element);
#if defined(ENABLE_DEBUG_LOG)
in_parse_block--;
SPRTF("Exit ParseBlock 9b %d...\n",in_parse_block);
#endif
return NULL;
}
/* parse known element */
if (TY_(nodeIsElement)(node))
{
if (node->tag->model & CM_INLINE)
{
if (checkstack && !node->implicit)
{
checkstack = no;
if (!(element->tag->model & CM_MIXED)) /* #431731 - fix by Randy Waki 25 Dec 00 */
{
if ( TY_(InlineDup)(doc, node) > 0 )
continue;
}
}
mode = MixedContent;
}
else
{
checkstack = yes;
mode = IgnoreWhitespace;
}
/* trim white space before
*/
if ( nodeIsBR(node) )
TrimSpaces( doc, element );
TY_(InsertNodeAtEnd)(element, node);
if (node->implicit)
TY_(Report)(doc, element, node, INSERTING_TAG );
/* Issue #212 - WHY is this hard coded to 'IgnoreWhitespace' while an
effort has been made above to set a 'MixedContent' mode in some cases?
WHY IS THE 'mode' VARIABLE NOT USED HERE???? */
ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
continue;
}
/* discard unexpected tags */
if (node->type == EndTag)
TY_(PopInline)( doc, node ); /* if inline end tag */
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
TY_(FreeNode)( doc, node );
continue;
}
if (!(element->tag->model & CM_OPT))
TY_(Report)(doc, element, node, MISSING_ENDTAG_FOR);
if (element->tag->model & CM_OBJECT)
{
/* pop inline stack */
while ( lexer->istacksize > lexer->istackbase )
TY_(PopInline)( doc, NULL );
lexer->istackbase = istackbase;
}
TrimSpaces( doc, element );
#if defined(ENABLE_DEBUG_LOG)
in_parse_block--;
SPRTF("Exit ParseBlock 10 %d...\n",in_parse_block);
#endif
return NULL;
}
/** MARK: TY_(oldParseInline)
* Parse inline element nodes.
*
* Reads tokens with TY_(GetToken) and attaches them to `element` until the
* element's own end tag is seen, an end tag has to be inferred (the current
* token is then usually pushed back with TY_(UngetToken) for the caller),
* or the input is exhausted.
*
* @param doc     The Tidy document being parsed; its lexer supplies tokens.
* @param element The element whose content is being parsed. Elements with
*                CM_EMPTY in their content model are returned immediately.
* @param mode    Token mode. Anything other than Preformatted is replaced
*                by MixedContent before tokens are fetched.
* @returns Always NULL.
*/
void* TY_(oldParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
{
#if defined(ENABLE_DEBUG_LOG)
/* nesting depth counter, only used for the debug trace output */
static int in_parse_inline = 0;
#endif
Lexer* lexer = doc->lexer;
Node *node, *parent;
#if defined(ENABLE_DEBUG_LOG)
in_parse_inline++;
SPRTF("Entering ParseInline %d...\n",in_parse_inline);
#endif
/* empty elements have no content to parse */
if (element->tag->model & CM_EMPTY) {
#if defined(ENABLE_DEBUG_LOG)
in_parse_inline--;
SPRTF("Exit ParseInline 1 %d...\n",in_parse_inline);
#endif
return NULL;
}
/*
ParseInline is used for some block level elements like H1 to H6
For such elements we need to insert inline emphasis tags currently
on the inline stack. For Inline elements, we normally push them
onto the inline stack provided they aren't implicit or OBJECT/APPLET.
This test is carried out in PushInline and PopInline, see istack.c
InlineDup(...) is not called for elements with a CM_MIXED (inline and
block) content model, otherwise constructs that nest such an element
inside inline markup will get corrupted.
(The example markup that originally illustrated this is missing here.)
*/
if ((TY_(nodeHasCM)(element, CM_BLOCK) || nodeIsDT(element)) &&
!TY_(nodeHasCM)(element, CM_MIXED))
TY_(InlineDup)(doc, NULL);
else if (TY_(nodeHasCM)(element, CM_INLINE))
TY_(PushInline)(doc, element);
/* record use of presentational elements in the document's badLayout flags */
if ( nodeIsNOBR(element) )
doc->badLayout |= USING_NOBR;
else if ( nodeIsFONT(element) )
doc->badLayout |= USING_FONT;
/* Inline elements may or may not be within a preformatted element */
if (mode != Preformatted)
mode = MixedContent;
while ((node = TY_(GetToken)(doc, mode)) != NULL)
{
/* end tag for current element */
if (node->tag == element->tag && node->type == EndTag)
{
if (element->tag->model & CM_INLINE)
TY_(PopInline)( doc, node );
TY_(FreeNode)( doc, node );
if (!(mode & Preformatted))
TrimSpaces(doc, element);
/*
if a font element wraps an anchor and nothing else
then move the font element inside the anchor since
otherwise it won't alter the anchor text color
*/
if ( nodeIsFONT(element) &&
element->content && element->content == element->last )
{
Node *child = element->content;
if ( nodeIsA(child) )
{
/* swap places: the anchor takes the font's position in the tree
   and the font becomes the anchor's only child, adopting the
   anchor's former children */
child->parent = element->parent;
child->next = element->next;
child->prev = element->prev;
element->next = NULL;
element->prev = NULL;
element->parent = child;
element->content = child->content;
element->last = child->last;
child->content = element;
TY_(FixNodeLinks)(child);
TY_(FixNodeLinks)(element);
}
}
element->closed = yes;
/* NOTE(review): this second TrimSpaces call is unconditional, unlike
   the Preformatted-guarded call above -- confirm this is intended */
TrimSpaces( doc, element );
#if defined(ENABLE_DEBUG_LOG)
in_parse_inline--;
SPRTF("Exit ParseInline 2 %d...\n",in_parse_inline);
#endif
return NULL;
}
/* <x>...<x>: map the 2nd start tag to an end tag if the 1st is explicit */
/* (see additional conditions below) */
/* otherwise emphasis nesting is probably unintentional */
/* big, small, sub, sup have cumulative effect to leave them alone */
if ( node->type == StartTag
&& node->tag == element->tag
&& TY_(IsPushed)( doc, node )
&& !node->implicit
&& !element->implicit
&& node->tag && (node->tag->model & CM_INLINE)
&& !nodeIsA(node)
&& !nodeIsFONT(node)
&& !nodeIsBIG(node)
&& !nodeIsSMALL(node)
&& !nodeIsSUB(node)
&& !nodeIsSUP(node)
&& !nodeIsQ(node)
&& !nodeIsSPAN(node)
&& cfgBool(doc, TidyCoerceEndTags)
)
{
/* proceeds only if "node" does not have any attribute and
follows a text node not finishing with a space */
if (element->content != NULL && node->attributes == NULL
&& TY_(nodeIsText)(element->last)
&& !TY_(TextNodeEndWithSpace)(doc->lexer, element->last) )
{
/* turn the token into an end tag and push it back so the next
   iteration sees it as the end tag for the current element */
TY_(Report)(doc, element, node, COERCE_TO_ENDTAG);
node->type = EndTag;
TY_(UngetToken)(doc);
continue;
}
if (node->attributes == NULL || element->attributes == NULL)
TY_(Report)(doc, element, node, NESTED_EMPHASIS);
}
else if ( TY_(IsPushed)(doc, node) && node->type == StartTag &&
nodeIsQ(node) )
{
/*\
* Issue #215 - such nested quotes are NOT a problem if HTML5, so
* only issue this warning if NOT HTML5 mode.
\*/
if (TY_(HTMLVersion)(doc) != HT50)
{
TY_(Report)(doc, element, node, NESTED_QUOTATION);
}
}
if ( TY_(nodeIsText)(node) )
{
/* only called for 1st child */
if ( element->content == NULL && !(mode & Preformatted) )
TrimSpaces( doc, element );
/* drop text nodes that are empty (start >= end) */
if ( node->start >= node->end )
{
TY_(FreeNode)( doc, node );
continue;
}
TY_(InsertNodeAtEnd)(element, node);
continue;
}
/* mixed content model so allow text */
if (InsertMisc(element, node))
continue;
/* deal with HTML tags */
if ( nodeIsHTML(node) )
{
if ( TY_(nodeIsElement)(node) )
{
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
TY_(FreeNode)( doc, node );
continue;
}
/* otherwise infer end of inline element */
TY_(UngetToken)( doc );
if (!(mode & Preformatted))
TrimSpaces(doc, element);
#if defined(ENABLE_DEBUG_LOG)
in_parse_inline--;
SPRTF("Exit ParseInline 3 %d...\n",in_parse_inline);
#endif
return NULL;
}
/* within <dt> (or a descendant of one), or in preformatted mode,
   map a <p> start tag to <br> */
if ( nodeIsP(node) &&
node->type == StartTag &&
( (mode & Preformatted) ||
nodeIsDT(element) ||
DescendantOf(element, TidyTag_DT )
)
)
{
node->tag = TY_(LookupTagDef)( TidyTag_BR );
TidyDocFree(doc, node->element);
node->element = TY_(tmbstrdup)(doc->allocator, "br");
TrimSpaces(doc, element);
TY_(InsertNodeAtEnd)(element, node);
continue;
}
/*
<p> allowed within <address>
in HTML 4.01 Transitional */
if ( nodeIsP(node) &&
node->type == StartTag &&
nodeIsADDRESS(element) )
{
TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
TY_(InsertNodeAtEnd)(element, node);
(*node->tag->parser)( doc, node, mode, no );
continue;
}
/* ignore unknown and PARAM tags */
if ( node->tag == NULL || nodeIsPARAM(node) )
{
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)( doc, node );
continue;
}
/* a </br> end tag is treated as a <br> start tag */
if ( nodeIsBR(node) && node->type == EndTag )
node->type = StartTag;
if ( node->type == EndTag )
{
/* coerce </br> to <br>
   NOTE(review): a </br> was already converted just above, so this
   first branch looks unreachable */
if ( nodeIsBR(node) )
node->type = StartTag;
else if ( nodeIsP(node) )
{
/* coerce unmatched </p> to <br><br>
*/
if ( !DescendantOf(element, TidyTag_P) )
{
TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
TrimSpaces( doc, element );
TY_(InsertNodeAtEnd)( element, node );
node = TY_(InferredTag)(doc, TidyTag_BR);
TY_(InsertNodeAtEnd)( element, node ); /* todo: check this */
continue;
}
}
else if ( TY_(nodeHasCM)(node, CM_INLINE)
&& !nodeIsA(node)
&& !TY_(nodeHasCM)(node, CM_OBJECT)
&& TY_(nodeHasCM)(element, CM_INLINE) )
{
/* allow any inline end tag to end current element */
/* http://tidy.sf.net/issue/1426419 */
/* but, like the browser, retain an earlier inline element.
This is implemented by setting the lexer into a mode
where it gets tokens from the inline stack rather than
from the input stream. Check if the scenerio fits. */
if ( !nodeIsA(element)
&& (node->tag != element->tag)
&& TY_(IsPushed)( doc, node )
&& TY_(IsPushed)( doc, element ) )
{
/* we have something like
<b>bold <i>bold and italic</b> italics</i> */
if ( TY_(SwitchInline)( doc, element, node ) )
{
TY_(Report)(doc, element, node, NON_MATCHING_ENDTAG);
TY_(UngetToken)( doc ); /* put this back */
TY_(InlineDup1)( doc, NULL, element ); /* dupe the current element (<i>), after the end tag (</b>) */
if (!(mode & Preformatted))
TrimSpaces( doc, element );
#if defined(ENABLE_DEBUG_LOG)
in_parse_inline--;
SPRTF("Exit ParseInline 4 %d...\n",in_parse_inline);
#endif
return NULL; /* close the current element (<i>), but will re-open it, after the end tag (</b>) */
}
}
TY_(PopInline)( doc, element );
if ( !nodeIsA(element) )
{
if ( nodeIsA(node) && node->tag != element->tag )
{
TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE );
TY_(UngetToken)( doc );
}
else
{
TY_(Report)(doc, element, node, NON_MATCHING_ENDTAG);
TY_(FreeNode)( doc, node);
}
if (!(mode & Preformatted))
TrimSpaces(doc, element);
#if defined(ENABLE_DEBUG_LOG)
in_parse_inline--;
SPRTF("Exit ParseInline 5 %d...\n",in_parse_inline);
#endif
return NULL;
}
/* if parent is <a> then discard unexpected inline end tag */
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)( doc, node);
continue;
} /* special case: table-related end tags for stuff moved in front of table */
else if ( lexer->exiled
&& (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
{
TY_(UngetToken)( doc );
TrimSpaces(doc, element);
#if defined(ENABLE_DEBUG_LOG)
in_parse_inline--;
SPRTF("Exit ParseInline 6 %d...\n",in_parse_inline);
#endif
return NULL;
}
}
/* allow any header tag to end current header */
if ( TY_(nodeHasCM)(node, CM_HEADING) && TY_(nodeHasCM)(element, CM_HEADING) )
{
if ( node->tag == element->tag )
{
/* same heading tag again: consume it as if it were the end tag */
TY_(Report)(doc, element, node, NON_MATCHING_ENDTAG );
TY_(FreeNode)( doc, node);
}
else
{
/* a different heading: close this one and let the caller see it */
TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE );
TY_(UngetToken)( doc );
}
if (!(mode & Preformatted))
TrimSpaces(doc, element);
#if defined(ENABLE_DEBUG_LOG)
in_parse_inline--;
SPRTF("Exit ParseInline 7 %d...\n",in_parse_inline);
#endif
return NULL;
}
/*
an <a> tag ends any open <a> element; an <a> start tag without
attributes is coerced to </a> when TidyCoerceEndTags is set
*/
/* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
/* if (node->tag == doc->tags.tag_a && !node->implicit && TY_(IsPushed)(doc, node)) */
if ( nodeIsA(node) && !node->implicit &&
(nodeIsA(element) || DescendantOf(element, TidyTag_A)) )
{
/* coerce <a> to </a> unless it has some attributes */
/* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
/* other fixes by Dave Raggett */
/* if (node->attributes == NULL) */
if (node->type != EndTag && node->attributes == NULL
&& cfgBool(doc, TidyCoerceEndTags) )
{
node->type = EndTag;
TY_(Report)(doc, element, node, COERCE_TO_ENDTAG);
/* TY_(PopInline)( doc, node ); */
TY_(UngetToken)( doc );
continue;
}
/* otherwise end the current element here and leave the <a> token
   for the caller */
TY_(UngetToken)( doc );
TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE);
/* TY_(PopInline)( doc, element ); */
if (!(mode & Preformatted))
TrimSpaces(doc, element);
#if defined(ENABLE_DEBUG_LOG)
in_parse_inline--;
SPRTF("Exit ParseInline 8 %d...\n",in_parse_inline);
#endif
return NULL;
}
/* block-ish elements (center, div, hr) found inside a heading */
if (element->tag->model & CM_HEADING)
{
if ( nodeIsCENTER(node) || nodeIsDIV(node) )
{
if (!TY_(nodeIsElement)(node))
{
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)( doc, node);
continue;
}
TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN);
/* insert center as parent if heading is empty */
if (element->content == NULL)
{
InsertNodeAsParent(element, node);
continue;
}
/* split heading and make center parent of 2nd part */
TY_(InsertNodeAfterElement)(element, node);
if (!(mode & Preformatted))
TrimSpaces(doc, element);
/* from here on, tokens are added to the clone (the 2nd part) */
element = TY_(CloneNode)( doc, element );
TY_(InsertNodeAtEnd)(node, element);
continue;
}
if ( nodeIsHR(node) )
{
if ( !TY_(nodeIsElement)(node) )
{
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)( doc, node);
continue;
}
TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN);
/* insert hr before heading if heading is empty */
if (element->content == NULL)
{
TY_(InsertNodeBeforeElement)(element, node);
continue;
}
/* split heading and insert hr before 2nd part */
TY_(InsertNodeAfterElement)(element, node);
if (!(mode & Preformatted))
TrimSpaces(doc, element);
/* from here on, tokens are added to the clone (the 2nd part) */
element = TY_(CloneNode)( doc, element );
TY_(InsertNodeAfterElement)(node, element);
continue;
}
}
/* an <hr> found inside a <dt> is wrapped in an inferred <dd> */
if ( nodeIsDT(element) )
{
if ( nodeIsHR(node) )
{
Node *dd;
if ( !TY_(nodeIsElement)(node) )
{
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)( doc, node);
continue;
}
TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN);
dd = TY_(InferredTag)(doc, TidyTag_DD);
/* insert hr within dd before dt if dt is empty */
if (element->content == NULL)
{
TY_(InsertNodeBeforeElement)(element, dd);
TY_(InsertNodeAtEnd)(dd, node);
continue;
}
/* split dt and insert hr within dd before 2nd part */
TY_(InsertNodeAfterElement)(element, dd);
TY_(InsertNodeAtEnd)(dd, node);
if (!(mode & Preformatted))
TrimSpaces(doc, element);
/* from here on, tokens are added to the clone (the 2nd part) */
element = TY_(CloneNode)( doc, element );
TY_(InsertNodeAfterElement)(dd, element);
continue;
}
}
/*
if this is the end tag for an ancestor element
then infer end tag for this element
*/
if (node->type == EndTag)
{
for (parent = element->parent;
parent != NULL; parent = parent->parent)
{
if (node->tag == parent->tag)
{
if (!(element->tag->model & CM_OPT) && !element->implicit)
TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE);
if( TY_(IsPushedLast)( doc, element, node ) )
TY_(PopInline)( doc, element );
/* leave the end tag for the ancestor's parser */
TY_(UngetToken)( doc );
if (!(mode & Preformatted))
TrimSpaces(doc, element);
#if defined(ENABLE_DEBUG_LOG)
in_parse_inline--;
SPRTF("Exit ParseInline 9 %d...\n",in_parse_inline);
#endif
return NULL;
}
}
}
/*\
* block level tags end this element
* Issue #333 - There seems an exception if the element is a 'span',
* and the node just collected is a 'meta'. The 'meta' can not have
* CM_INLINE added, nor can the 'span' have CM_MIXED added without
* big consequences.
* There may be other exceptions to be added...
\*/
if (!(node->tag->model & CM_INLINE) &&
!(element->tag->model & CM_MIXED) &&
!(nodeIsSPAN(element) && nodeIsMETA(node)) )
{
if ( !TY_(nodeIsElement)(node) )
{
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)( doc, node);
continue;
}
/* HTML5 */
if (nodeIsDATALIST(element)) {
TY_(ConstrainVersion)( doc, ~VERS_HTML5 );
} else
if (!(element->tag->model & CM_OPT))
TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE);
/* head-only content (CM_HEAD without CM_BLOCK) is moved to the head */
if (node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK))
{
MoveToHead(doc, element, node);
continue;
}
/*
prevent anchors from propagating into block tags
except for headings h1 to h6
*/
if ( nodeIsA(element) )
{
if (node->tag && !(node->tag->model & CM_HEADING))
TY_(PopInline)( doc, element );
else if (!(element->content))
{
/* an empty anchor directly before a heading is dropped */
TY_(DiscardElement)( doc, element );
TY_(UngetToken)( doc );
#if defined(ENABLE_DEBUG_LOG)
in_parse_inline--;
SPRTF("Exit ParseInline 10 %d...\n",in_parse_inline);
#endif
return NULL;
}
}
TY_(UngetToken)( doc );
if (!(mode & Preformatted))
TrimSpaces(doc, element);
#if defined(ENABLE_DEBUG_LOG)
in_parse_inline--;
SPRTF("Exit ParseInline 11 %d...\n",in_parse_inline);
#endif
return NULL;
}
/* parse inline element */
if (TY_(nodeIsElement)(node))
{
if (node->implicit)
TY_(Report)(doc, element, node, INSERTING_TAG);
/* trim white space before <br>
*/
if ( nodeIsBR(node) )
TrimSpaces(doc, element);
TY_(InsertNodeAtEnd)(element, node);
ParseTag(doc, node, mode);
continue;
}
/* discard unexpected tags */
TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)( doc, node );
continue;
}
/* input exhausted without an end tag; node is NULL at this point */
if (!(element->tag->model & CM_OPT))
TY_(Report)(doc, element, node, MISSING_ENDTAG_FOR);
#if defined(ENABLE_DEBUG_LOG)
in_parse_inline--;
SPRTF("Exit ParseInline 12 %d...\n",in_parse_inline);
#endif
return NULL;
}
/** MARK: TY_(oldParseDefList)
 *  Parses the `dl` tag.
 *
 *  Tokens are read in IgnoreWhitespace mode. `dt`/`dd` children are
 *  appended to `list` and parsed; text gets an inferred `dt`, other block
 *  or inline content gets an inferred `dd`. A `center` splits the list in
 *  two. `mode` is only forwarded when parsing such a `center`.
 *  Always returns NULL.
 */
void* TY_(oldParseDefList)(TidyDocImpl* doc, Node *list, GetTokenMode mode)
{
    Lexer* lexer = doc->lexer;
    Node *token;

    /* nothing to parse for an element declared empty */
    if (list->tag->model & CM_EMPTY)
        return NULL;

    /* defer implicit inline start tags */
    lexer->insert = NULL;

    while ((token = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL)
    {
        /* the list's own end tag closes it */
        if (token->type == EndTag && token->tag == list->tag)
        {
            TY_(FreeNode)( doc, token);
            list->closed = yes;
            return NULL;
        }

        /* comments, processing instructions and the like */
        if (InsertMisc(list, token))
            continue;

        /* bare text: push it back and wrap it in an inferred <dt> */
        if (TY_(nodeIsText)(token))
        {
            TY_(UngetToken)( doc );
            token = TY_(InferredTag)(doc, TidyTag_DT);
            TY_(Report)(doc, list, token, MISSING_STARTTAG);
        }

        /* unknown tags are dropped */
        if (token->tag == NULL)
        {
            TY_(Report)(doc, list, token, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, token);
            continue;
        }

        /*
          An end tag that belongs to an ancestor implies the end of
          this list as well.
        */
        if (token->type == EndTag)
        {
            Node *ancestor;
            Bool reachedBody = no;

            if ( nodeIsFORM(token) )
            {
                BadForm( doc );
                TY_(Report)(doc, list, token, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, token );
                continue;
            }

            ancestor = list->parent;
            while (ancestor != NULL)
            {
                /* Never match across BODY: that would loop forever
                   between ParseBody and this parser,
                   see http://tidy.sf.net/bug/1098012. */
                if ( nodeIsBODY(ancestor) )
                {
                    reachedBody = yes;
                    break;
                }

                if (token->tag == ancestor->tag)
                {
                    TY_(Report)(doc, list, token, MISSING_ENDTAG_BEFORE);
                    TY_(UngetToken)( doc );
                    return NULL;
                }

                ancestor = ancestor->parent;
            }

            if (reachedBody)
            {
                TY_(Report)(doc, list, token, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, token);
                continue;
            }
        }

        /* a <center> inside a dl (or dt) breaks the list in two */
        if ( nodeIsCENTER(token) )
        {
            Node *holder;

            if (list->content == NULL)
                TY_(InsertNodeBeforeElement)(list, token); /* trim empty dl list */
            else
                TY_(InsertNodeAfterElement)(list, token);

            /* #426885 - fix by Glenn Carroll 19 Apr 00, and
               Gary Dechaines 11 Aug 00.
               ParseTag may destroy the center node when it is followed
               immediately by its end tag, so remember where it was
               attached in order to detect that afterwards. */
            holder = token->parent;

            /* parse the contents of the center with blocks allowed */
            lexer->excludeBlocks = no;
            ParseTag( doc, token, mode);
            lexer->excludeBlocks = yes;

            /* continue in a fresh dl after the center, provided the
               center is still the last child of its parent */
            if (holder->last == token)
            {
                list = TY_(InferredTag)(doc, TidyTag_DL);
                TY_(InsertNodeAfterElement)(token, list);
            }
            continue;
        }

        /* anything that is neither <dt> nor <dd> */
        if ( !nodeIsDT(token) && !nodeIsDD(token) )
        {
            TY_(UngetToken)( doc );

            if ((token->tag->model & (CM_BLOCK | CM_INLINE)) == 0)
            {
                TY_(Report)(doc, list, token, TAG_NOT_ALLOWED_IN);
                return NULL;
            }

            /* if DD appeared directly in BODY then exclude blocks */
            if (lexer->excludeBlocks && !(token->tag->model & CM_INLINE))
                return NULL;

            token = TY_(InferredTag)(doc, TidyTag_DD);
            TY_(Report)(doc, list, token, MISSING_STARTTAG);
        }

        /* a stray </dt> or </dd> */
        if (token->type == EndTag)
        {
            TY_(Report)(doc, list, token, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, token);
            continue;
        }

        /* token is now a <dt> or <dd> start tag */
        TY_(InsertNodeAtEnd)(list, token);
        ParseTag( doc, token, IgnoreWhitespace);
    }

    /* ran out of input; token is NULL here */
    TY_(Report)(doc, list, token, MISSING_ENDTAG_FOR);
    return NULL;
}
/** MARK: TY_(oldParseList)
 *  Parses list tags.
 *
 *  Tokens are read in IgnoreWhitespace mode. `li` children are appended
 *  to `list` and parsed; other content is either placed directly in the
 *  list (HTML5 mode), nested into the last `li` of an ordered list, or
 *  wrapped in an inferred `li`. The `mode` argument is unused.
 *  Always returns NULL.
 */
void* TY_(oldParseList)(TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode))
{
#if defined(ENABLE_DEBUG_LOG)
    static int in_parse_list = 0;
#endif
    Lexer* lexer = doc->lexer;
    Node *token, *lastli;
    Bool nodeisOL = nodeIsOL(list);

#if defined(ENABLE_DEBUG_LOG)
    in_parse_list++;
    SPRTF("Entering ParseList %d...\n",in_parse_list);
#endif

    /* nothing to parse for an element declared empty */
    if (list->tag->model & CM_EMPTY)
    {
#if defined(ENABLE_DEBUG_LOG)
        in_parse_list--;
        SPRTF("Exit ParseList 1 %d... CM_EMPTY\n",in_parse_list);
#endif
        return NULL;
    }

    /* defer implicit inline start tags */
    lexer->insert = NULL;

    while ((token = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL)
    {
        Bool foundLI = no;

        /* the list's own end tag closes it */
        if (token->type == EndTag && token->tag == list->tag)
        {
            TY_(FreeNode)( doc, token);
            list->closed = yes;
#if defined(ENABLE_DEBUG_LOG)
            in_parse_list--;
            SPRTF("Exit ParseList 2 %d... Endtag\n",in_parse_list);
#endif
            return NULL;
        }

        /* comments, processing instructions and the like */
        if (InsertMisc(list, token))
            continue;

        /* unknown tags are dropped */
        if (token->tag == NULL && token->type != TextNode)
        {
            TY_(Report)(doc, list, token, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, token);
            continue;
        }

        if (token->type == TextNode)
        {
            /* Issue #572 - skip leading whitespace; a text node made
               up of nothing but whitespace is thrown away. */
            uint pos = token->start;

            while (pos < token->end)
            {
                uint ch = lexer->lexbuf[pos] & 0xff;

                if (!(ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n'))
                    break;
                ++pos;
            }

            if (pos >= token->end)
            {
                TY_(FreeNode)(doc, token);
                continue;
            }
        }

        /*
          An end tag that belongs to an ancestor implies the end of
          this list as well; every other end tag is discarded.
        */
        if (token->type == EndTag)
        {
            Node *ancestor;

            if ( nodeIsFORM(token) )
            {
                BadForm( doc );
                TY_(Report)(doc, list, token, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, token );
                continue;
            }

            if (TY_(nodeHasCM)(token,CM_INLINE))
            {
                TY_(Report)(doc, list, token, DISCARDING_UNEXPECTED);
                TY_(PopInline)( doc, token );
                TY_(FreeNode)( doc, token);
                continue;
            }

            ancestor = list->parent;
            while (ancestor != NULL)
            {
                /* Never match across BODY: that would loop forever
                   between ParseBody and this parser,
                   see http://tidy.sf.net/bug/1053626. */
                if ( nodeIsBODY(ancestor) )
                    break;

                if (token->tag == ancestor->tag)
                {
                    TY_(Report)(doc, list, token, MISSING_ENDTAG_BEFORE);
                    TY_(UngetToken)( doc );
#if defined(ENABLE_DEBUG_LOG)
                    in_parse_list--;
                    SPRTF("Exit ParseList 3 %d... No End Tag\n",in_parse_list);
#endif
                    return NULL;
                }

                ancestor = ancestor->parent;
            }

            TY_(Report)(doc, list, token, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, token);
            continue;
        }

        /* Issue #572 - in an ordered list, check for an existing <li>
           that non-<li> content can be nested into */
        if ( nodeisOL && !nodeIsLI(token) )
            foundLI = FindLastLI(list, &lastli);

        if ( nodeIsLI(token) || (TY_(IsHTML5Mode)(doc) && !foundLI) )
        {
            /* Either a real <li>, or (Issue #396) HTML5 mode where a
               list may hold zero or more <li> elements: the token goes
               straight into the list. */
            TY_(InsertNodeAtEnd)(list,token);
        }
        else
        {
            TY_(UngetToken)( doc );

            if (TY_(nodeHasCM)(token,CM_BLOCK) && lexer->excludeBlocks)
            {
                TY_(Report)(doc, list, token, MISSING_ENDTAG_BEFORE);
#if defined(ENABLE_DEBUG_LOG)
                in_parse_list--;
                SPRTF("Exit ParseList 4 %d... No End Tag\n",in_parse_list);
#endif
                return NULL;
            }

            /* http://tidy.sf.net/issue/1316307
               In exiled mode, return so table processing can continue. */
            if ( lexer->exiled
                 && (TY_(nodeHasCM)(token, CM_TABLE|CM_ROWGRP|CM_ROW)
                     || nodeIsTABLE(token)) )
            {
#if defined(ENABLE_DEBUG_LOG)
                in_parse_list--;
                SPRTF("Exit ParseList 5 %d... exiled\n",in_parse_list);
#endif
                return NULL;
            }

            /* http://tidy.sf.net/issue/836462
               For an ordered list that already has an <li>, put the
               next tag inside that last <li> so the numbering matches
               the visual rendering of most browsers. */
            if ( nodeIsOL(list) && FindLastLI(list, &lastli) )
            {
                /* temporary node, created only for error reporting */
                token = TY_(InferredTag)(doc, TidyTag_LI);
                TY_(Report)(doc, list, token, MISSING_STARTTAG );
                TY_(FreeNode)( doc, token);
                token = lastli;
            }
            else
            {
                /* wrap the content in an inferred <li> */
                Bool wasblock = TY_(nodeHasCM)(token,CM_BLOCK);

                token = TY_(InferredTag)(doc, TidyTag_LI);

                /* "display: inline" avoids a blank line after the <li>
                   with Internet Explorer,
                   see http://tidy.sf.net/issue/836462 */
                if (wasblock)
                    TY_(AddStyleProperty)( doc, token,
                                           "list-style: none; display: inline" );
                else
                    TY_(AddStyleProperty)( doc, token, "list-style: none" );

                TY_(Report)(doc, list, token, MISSING_STARTTAG );
                TY_(InsertNodeAtEnd)(list,token);
            }
        }

        ParseTag( doc, token, IgnoreWhitespace);
    }

    /* ran out of input; token is NULL here */
    TY_(Report)(doc, list, token, MISSING_ENDTAG_FOR);
#if defined(ENABLE_DEBUG_LOG)
    in_parse_list--;
    SPRTF("Exit ParseList 6 %d... missing end tag\n",in_parse_list);
#endif
    return NULL;
}
/** MARK: TY_(oldParseRow)
 *  Parses the `row` tag.
 *
 *  Tokens are read in IgnoreWhitespace mode. `td`/`th` cells are appended
 *  to `row` and parsed; text, block and inline content is moved in front
 *  of the table, head content is moved to the head, and a `form` start
 *  tag gets an inferred `td`. The `mode` argument is unused.
 *  Always returns NULL.
 */
void* TY_(oldParseRow)(TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode))
{
    Lexer* lexer = doc->lexer;
    Node *token;
    Bool savedExclude;

    /* nothing to parse for an element declared empty */
    if (row->tag->model & CM_EMPTY)
        return NULL;

    while ((token = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
    {
        /* same tag as the row: its end tag closes the row, while a new
           row start tag implies the end of the current one */
        if (token->tag == row->tag)
        {
            if (token->type == EndTag)
            {
                TY_(FreeNode)( doc, token);
                row->closed = yes;
            }
            else
            {
                TY_(UngetToken)( doc );
            }
            FixEmptyRow( doc, row);
            return NULL;
        }

        if ( token->type == EndTag )
        {
            /* the end tag of an enclosing html/table element implies
               the end of this row */
            if ( (TY_(nodeHasCM)(token, CM_HTML|CM_TABLE) || nodeIsTABLE(token))
                 && DescendantOf(row, TagId(token)) )
            {
                TY_(UngetToken)( doc );
                return NULL;
            }

            if ( nodeIsFORM(token) )
                BadForm( doc );

            /* stray form, block, inline or cell end tags are dropped */
            if ( nodeIsFORM(token)
                 || TY_(nodeHasCM)(token, CM_BLOCK|CM_INLINE)
                 || nodeIsTD(token)
                 || nodeIsTH(token) )
            {
                TY_(Report)(doc, row, token, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, token);
                continue;
            }
        }

        /* comments, processing instructions and the like */
        if (InsertMisc(row, token))
            continue;

        /* unknown tags and unexpected <table> elements are dropped */
        if ( (token->tag == NULL && token->type != TextNode)
             || nodeIsTABLE(token) )
        {
            TY_(Report)(doc, row, token, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, token);
            continue;
        }

        /* THEAD, TFOOT or TBODY ends the row */
        if ( TY_(nodeHasCM)(token, CM_ROWGRP) )
        {
            TY_(UngetToken)( doc );
            return NULL;
        }

        /* any end tag still left over is dropped */
        if (token->type == EndTag)
        {
            TY_(Report)(doc, row, token, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, token);
            continue;
        }

        /*
          From here on the token is not an end tag:
          text, inline or block content moves before the table,
          head content moves to the head.
        */
        if ( nodeIsFORM(token) )
        {
            TY_(UngetToken)( doc );
            token = TY_(InferredTag)(doc, TidyTag_TD);
            TY_(Report)(doc, row, token, MISSING_STARTTAG);
        }
        else if ( TY_(nodeIsText)(token)
                  || TY_(nodeHasCM)(token, CM_BLOCK | CM_INLINE) )
        {
            MoveBeforeTable( doc, row, token );
            TY_(Report)(doc, row, token, TAG_NOT_ALLOWED_IN);

            /* parse the moved element in exiled mode with blocks
               allowed, then restore the lexer state */
            lexer->exiled = yes;
            savedExclude = lexer->excludeBlocks;
            lexer->excludeBlocks = no;

            if (token->type != TextNode)
                ParseTag( doc, token, IgnoreWhitespace);

            lexer->exiled = no;
            lexer->excludeBlocks = savedExclude;
            continue;
        }
        else if (token->tag->model & CM_HEAD)
        {
            TY_(Report)(doc, row, token, TAG_NOT_ALLOWED_IN);
            MoveToHead( doc, row, token);
            continue;
        }

        /* only cells may remain */
        if ( !nodeIsTD(token) && !nodeIsTH(token) )
        {
            TY_(Report)(doc, row, token, TAG_NOT_ALLOWED_IN);
            TY_(FreeNode)( doc, token);
            continue;
        }

        /* token is a <td> or <th>: parse it with blocks allowed */
        TY_(InsertNodeAtEnd)(row, token);

        savedExclude = lexer->excludeBlocks;
        lexer->excludeBlocks = no;
        ParseTag( doc, token, IgnoreWhitespace);
        lexer->excludeBlocks = savedExclude;

        /* pop inline stack */
        while ( lexer->istacksize > lexer->istackbase )
            TY_(PopInline)( doc, NULL );
    }

    return NULL;
}
/** MARK: TY_(oldParseRowGroup)
 *  Parses the `rowgroup` tag.
 *
 *  Tokens are read in IgnoreWhitespace mode. `tr` rows are appended to
 *  `rowgroup` and parsed; a `td`/`th` gets an inferred `tr`, text, block
 *  and inline content is moved in front of the table, and head content
 *  is moved to the head. The `mode` argument is unused.
 *  Always returns NULL.
 */
void* TY_(oldParseRowGroup)(TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSED(mode))
{
    Lexer* lexer = doc->lexer;
    Node *token;

    /* nothing to parse for an element declared empty */
    if (rowgroup->tag->model & CM_EMPTY)
        return NULL;

    while ((token = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
    {
        /* same tag as the row group: the end tag closes it, a start
           tag is pushed back; either way this group is finished */
        if (token->tag == rowgroup->tag)
        {
            if (token->type == EndTag)
            {
                rowgroup->closed = yes;
                TY_(FreeNode)( doc, token);
            }
            else
            {
                TY_(UngetToken)( doc );
            }
            return NULL;
        }

        /* </table> implies the end of the row group */
        if ( token->type == EndTag && nodeIsTABLE(token) )
        {
            TY_(UngetToken)( doc );
            return NULL;
        }

        /* comments, processing instructions and the like */
        if (InsertMisc(rowgroup, token))
            continue;

        /* unknown tags are dropped */
        if (token->tag == NULL && token->type != TextNode)
        {
            TY_(Report)(doc, rowgroup, token, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, token);
            continue;
        }

        /*
          TD or TH: infer a <tr>;
          text, inline or block content: move before the table;
          head content: move to the head.
        */
        if (token->type != EndTag)
        {
            if ( nodeIsTD(token) || nodeIsTH(token) )
            {
                TY_(UngetToken)( doc );
                token = TY_(InferredTag)(doc, TidyTag_TR);
                TY_(Report)(doc, rowgroup, token, MISSING_STARTTAG);
            }
            else if ( TY_(nodeIsText)(token)
                      || TY_(nodeHasCM)(token, CM_BLOCK|CM_INLINE) )
            {
                MoveBeforeTable( doc, rowgroup, token );
                TY_(Report)(doc, rowgroup, token, TAG_NOT_ALLOWED_IN);

                /* parse the moved element in exiled mode */
                lexer->exiled = yes;
                if (token->type != TextNode)
                    ParseTag(doc, token, IgnoreWhitespace);
                lexer->exiled = no;
                continue;
            }
            else if (token->tag->model & CM_HEAD)
            {
                TY_(Report)(doc, rowgroup, token, TAG_NOT_ALLOWED_IN);
                MoveToHead(doc, rowgroup, token);
                continue;
            }
        }

        /*
          An end tag that belongs to an ancestor implies the end of
          this row group as well.
        */
        if (token->type == EndTag)
        {
            Node *ancestor;

            if ( nodeIsFORM(token) )
                BadForm( doc );

            /* stray form, block, inline, row or cell end tags are dropped */
            if ( nodeIsFORM(token)
                 || TY_(nodeHasCM)(token, CM_BLOCK|CM_INLINE)
                 || nodeIsTR(token)
                 || nodeIsTD(token)
                 || nodeIsTH(token) )
            {
                TY_(Report)(doc, rowgroup, token, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, token);
                continue;
            }

            ancestor = rowgroup->parent;
            while (ancestor != NULL)
            {
                if (token->tag == ancestor->tag)
                {
                    TY_(UngetToken)( doc );
                    return NULL;
                }
                ancestor = ancestor->parent;
            }
        }

        /* a THEAD, TFOOT or TBODY start tag implies the end tag */
        if ((token->tag->model & CM_ROWGRP) && token->type != EndTag)
        {
            TY_(UngetToken)( doc );
            return NULL;
        }

        /* any end tag still left over is dropped */
        if (token->type == EndTag)
        {
            TY_(Report)(doc, rowgroup, token, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, token);
            continue;
        }

        /* anything that is not a row gets an inferred <tr> around it */
        if ( !nodeIsTR(token) )
        {
            token = TY_(InferredTag)(doc, TidyTag_TR);
            TY_(Report)(doc, rowgroup, token, MISSING_STARTTAG);
            TY_(UngetToken)( doc );
        }

        /* token is now a <tr> */
        TY_(InsertNodeAtEnd)(rowgroup, token);
        ParseTag(doc, token, IgnoreWhitespace);
    }

    return NULL;
}
/** MARK: TY_(oldParseColGroup)
 *  Parses the `colgroup` tag.
 *
 *  Tokens are read in IgnoreWhitespace mode and `col` children are
 *  appended to `colgroup`. Parsing stops, with the token pushed back, at
 *  text, at any known tag other than `col`, or at an end tag matching one
 *  of colgroup's ancestors. The `mode` argument is unused.
 *  Always returns NULL.
 */
void* TY_(oldParseColGroup)(TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSED(mode))
{
    Node *token;

    /* nothing to parse for an element declared empty */
    if (colgroup->tag->model & CM_EMPTY)
        return NULL;

    for (;;)
    {
        token = TY_(GetToken)(doc, IgnoreWhitespace);
        if (token == NULL)
            break;

        if (token->type == EndTag)
        {
            Node *ancestor;

            /* the colgroup's own end tag closes it */
            if (token->tag == colgroup->tag)
            {
                TY_(FreeNode)( doc, token);
                colgroup->closed = yes;
                return NULL;
            }

            if ( nodeIsFORM(token) )
            {
                BadForm( doc );
                TY_(Report)(doc, colgroup, token, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, token);
                continue;
            }

            /* an end tag that belongs to an ancestor implies the end
               of this colgroup as well */
            ancestor = colgroup->parent;
            while (ancestor != NULL)
            {
                if (token->tag == ancestor->tag)
                {
                    TY_(UngetToken)( doc );
                    return NULL;
                }
                ancestor = ancestor->parent;
            }
        }

        /* text ends the colgroup */
        if (TY_(nodeIsText)(token))
        {
            TY_(UngetToken)( doc );
            return NULL;
        }

        /* comments, processing instructions and the like */
        if (InsertMisc(colgroup, token))
            continue;

        /* unknown tags are dropped */
        if (token->tag == NULL)
        {
            TY_(Report)(doc, colgroup, token, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, token);
            continue;
        }

        /* any tag other than <col> ends the colgroup */
        if ( !nodeIsCOL(token) )
        {
            TY_(UngetToken)( doc );
            return NULL;
        }

        /* a stray </col> is dropped */
        if (token->type == EndTag)
        {
            TY_(Report)(doc, colgroup, token, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, token);
            continue;
        }

        /* token is a <col> start tag */
        TY_(InsertNodeAtEnd)(colgroup, token);
        ParseTag(doc, token, IgnoreWhitespace);
    }

    return NULL;
}
/** MARK: TY_(oldParseTableTag)
* Parses the content of a `table` element.
*
* Tokens are read with whitespace ignored until the table ends. The table
* ends on a token carrying the table's own tag (an end tag is consumed, a
* start tag is pushed back, see Issue #498), on the end tag of an ancestor,
* on a tag whose content model lacks CM_TABLE, or when no tokens remain.
*
* While the function runs, lexer->istackbase is set to lexer->istacksize;
* the saved value is restored on every exit path.
*
* @param doc The document being parsed.
* @param table The `table` node whose content is being read.
* @param mode Unused.
* @returns Always NULL.
*/
void* TY_(oldParseTableTag)(TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(mode))
{
#if defined(ENABLE_DEBUG_LOG)
/* depth of active calls; incremented on entry, decremented on each exit,
* and only used in the debug trace output */
static int in_parse_table = 0;
#endif
Lexer* lexer = doc->lexer;
Node *node, *parent;
uint istackbase;
TY_(DeferDup)( doc );
/* save lexer->istackbase and raise it to the current istacksize while the
* table is parsed; every return below puts the saved value back */
istackbase = lexer->istackbase;
lexer->istackbase = lexer->istacksize;
#if defined(ENABLE_DEBUG_LOG)
in_parse_table++;
SPRTF("Entering ParseTableTag %d...\n",in_parse_table);
#endif
while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
{
/* a token carrying the table's own tag always ends this table */
if (node->tag == table->tag )
{
if (node->type == EndTag)
{
TY_(FreeNode)(doc, node);
}
else
{
/* Issue #498 - If a <table> in a <table>
* just close the current table, and issue a
* warning. The previous action was to discard
* this second <table>
*/
TY_(UngetToken)(doc);
TY_(Report)(doc, table, node, TAG_NOT_ALLOWED_IN);
}
lexer->istackbase = istackbase;
table->closed = yes;
#if defined(ENABLE_DEBUG_LOG)
in_parse_table--;
SPRTF("Exit ParseTableTag 1 %d... EndTag\n",in_parse_table);
#endif
return NULL;
}
/* deal with comments etc. */
if (InsertMisc(table, node))
continue;
/* discard unknown tags (text nodes also have no tag and are kept) */
if (node->tag == NULL && node->type != TextNode)
{
TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)( doc, node);
continue;
}
/*
For anything that is not an end tag:
- TD or TH: push the token back and infer a TR to hold it;
- text, block or inline content: move it in front of the table;
- head-only elements: move them to the head.
NOTE(review): the nodeIsTABLE() test looks unreachable when `table` is
itself a TABLE node, because the tag comparison at the top of the loop
has already returned for such tokens - confirm.
*/
if (node->type != EndTag)
{
if ( nodeIsTD(node) || nodeIsTH(node) || nodeIsTABLE(node) )
{
TY_(UngetToken)( doc );
node = TY_(InferredTag)(doc, TidyTag_TR);
TY_(Report)(doc, table, node, MISSING_STARTTAG);
}
else if ( TY_(nodeIsText)(node) ||TY_(nodeHasCM)(node,CM_BLOCK|CM_INLINE) )
{
TY_(InsertNodeBeforeElement)(table, node);
TY_(Report)(doc, table, node, TAG_NOT_ALLOWED_IN);
/* lexer->exiled is set only while the relocated element is parsed */
lexer->exiled = yes;
if (node->type != TextNode)
ParseTag(doc, node, IgnoreWhitespace);
lexer->exiled = no;
continue;
}
else if (node->tag->model & CM_HEAD)
{
MoveToHead(doc, table, node);
continue;
}
}
/*
if this is the end tag for an ancestor element
then infer end tag for this element
*/
if (node->type == EndTag)
{
if ( nodeIsFORM(node) )
{
BadForm( doc );
TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)( doc, node);
continue;
}
/* best to discard unexpected block/inline end tags */
if ( TY_(nodeHasCM)(node, CM_TABLE|CM_ROW) ||
TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
{
TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)( doc, node);
continue;
}
/* walk up the ancestors looking for the element this end tag closes */
for ( parent = table->parent;
parent != NULL;
parent = parent->parent )
{
if (node->tag == parent->tag)
{
TY_(Report)(doc, table, node, MISSING_ENDTAG_BEFORE );
TY_(UngetToken)( doc );
lexer->istackbase = istackbase;
#if defined(ENABLE_DEBUG_LOG)
in_parse_table--;
SPRTF("Exit ParseTableTag 2 %d... missing EndTag\n",in_parse_table);
#endif
return NULL;
}
}
}
/* anything whose content model lacks CM_TABLE ends the table; the token
* is pushed back for the caller */
if (!(node->tag->model & CM_TABLE))
{
TY_(UngetToken)( doc );
TY_(Report)(doc, table, node, TAG_NOT_ALLOWED_IN);
lexer->istackbase = istackbase;
#if defined(ENABLE_DEBUG_LOG)
in_parse_table--;
SPRTF("Exit ParseTableTag 3 %d... CM_TABLE\n",in_parse_table);
#endif
return NULL;
}
/* a table-model element (including a TR inferred above) becomes a child */
if (TY_(nodeIsElement)(node))
{
TY_(InsertNodeAtEnd)(table, node);
ParseTag(doc, node, IgnoreWhitespace);
continue;
}
/* discard unexpected text nodes and end tags */
TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)( doc, node);
}
/* the token stream ended before the table was closed; node is NULL here */
TY_(Report)(doc, table, node, MISSING_ENDTAG_FOR);
lexer->istackbase = istackbase;
#if defined(ENABLE_DEBUG_LOG)
in_parse_table--;
SPRTF("Exit ParseTableTag 4 %d... missing end\n",in_parse_table);
#endif
return NULL;
}
/** MARK: TY_(oldParsePre)
* Parses the content of a `pre` element.
*
* Tokens are read in Preformatted mode. Text and elements accepted by
* PreContent() are appended to `pre`, and a `p` start tag is coerced to
* `br`. A rejected tag that is neither an end tag nor table-related is
* inserted after the current `pre` and parsed there; a newly inferred `pre`
* is then inserted after it and receives the remaining content.
*
* @param doc The document being parsed.
* @param pre The `pre` node whose content is being read.
* @param mode Unused.
* @returns Always NULL.
*/
void* TY_(oldParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) )
{
Node *node;
/* an element declared empty has no content to parse */
if (pre->tag->model & CM_EMPTY)
return NULL;
TY_(InlineDup)( doc, NULL ); /* tell lexer to insert inlines if needed */
while ((node = TY_(GetToken)(doc, Preformatted)) != NULL)
{
/* the end tag of this <pre>, or of an element this <pre> descends from */
if ( node->type == EndTag &&
(node->tag == pre->tag || DescendantOf(pre, TagId(node))) )
{
/* </body> and </html> never close the <pre>; they are dropped */
if (nodeIsBODY(node) || nodeIsHTML(node))
{
TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)(doc, node);
continue;
}
if (node->tag == pre->tag)
{
TY_(FreeNode)(doc, node);
}
else
{
/* an ancestor's end tag: report it and push it back for the caller */
TY_(Report)(doc, pre, node, MISSING_ENDTAG_BEFORE );
TY_(UngetToken)( doc );
}
pre->closed = yes;
TrimSpaces(doc, pre);
return NULL;
}
if (TY_(nodeIsText)(node))
{
TY_(InsertNodeAtEnd)(pre, node);
continue;
}
/* deal with comments etc. */
if (InsertMisc(pre, node))
continue;
/* discard unknown tags */
if (node->tag == NULL)
{
TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)(doc, node);
continue;
}
/* strip unexpected tags */
if ( !PreContent(doc, node) )
{
Node *newnode;
/* fix for http://tidy.sf.net/bug/772205 */
if (node->type == EndTag)
{
/* http://tidy.sf.net/issue/1590220 */
/* while lexer->exiled is set, a table-related end tag is pushed back
* and ends this <pre> instead of being discarded */
if ( doc->lexer->exiled
&& (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
{
TY_(UngetToken)(doc);
TrimSpaces(doc, pre);
return NULL;
}
TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)(doc, node);
continue;
}
/* http://tidy.sf.net/issue/1590220 */
/* a table-related start tag ends the <pre> and is pushed back */
else if (TY_(nodeHasCM)(node, CM_TABLE|CM_ROW)
|| nodeIsTABLE(node) )
{
if (!doc->lexer->exiled)
/* No missing close warning if exiled. */
TY_(Report)(doc, pre, node, MISSING_ENDTAG_BEFORE);
TY_(UngetToken)(doc);
return NULL;
}
/*
This is basically what Tidy 04 August 2000 did and far more accurate
with respect to browser behaviour than the previous approach (the
commented-out code this note originally referred to is no longer present).
Tidy could try to propagate the <pre> into each disallowed child where
<pre> is allowed in order to replicate some browsers' behaviour, but
there are a lot of exceptions, e.g. Internet Explorer does not propagate
<pre> into table cells while Mozilla does. Opera 6 never propagates
<pre> into blocklevel elements while Opera 7 behaves much like Mozilla.

Tidy behaves thus mostly like Opera 6 except for nested <pre> elements
which are handled like Mozilla takes them (Opera6 closes all <pre> after
the first </pre>).

There are similar issues like replacing <p> in <pre> with <br>, for
example

<pre>...<p>...</pre>                 (Input)
<pre>...<br>...</pre>                (Tidy)
<pre>...<br>...</pre>                (Opera 7 and Internet Explorer)
<pre>...<br><br>...</pre>            (Opera 6 and Mozilla)

<pre>...<p>...</p>...</pre>          (Input)
<pre>...<br>......</pre>             (Tidy, BUG!)
<pre>...<br>...<br>...</pre>         (Internet Explorer)
<pre>...<br><br>...<br><br>...</pre> (Mozilla, Opera 6)
<pre>...<br>...<br><br>...</pre>     (Opera 7)

or something similar, they could also be closing the <pre> and propagate
the <pre> into the newly opened <p>.

Todo: IMG, OBJECT, APPLET, BIG, SMALL, SUB, SUP, FONT, and BASEFONT are
disallowed in <pre>, Tidy neither detects this nor does it perform any
cleanup operation. Tidy should at least issue a warning if it encounters
such constructs.

Todo: discarding </p> is obviously a bug, it should be replaced by <br>.
*/
/* move the disallowed element after the current <pre> and parse it there */
TY_(InsertNodeAfterElement)(pre, node);
TY_(Report)(doc, pre, node, MISSING_ENDTAG_BEFORE);
ParseTag(doc, node, IgnoreWhitespace);
/* continue in a freshly inferred <pre> placed after that element */
newnode = TY_(InferredTag)(doc, TidyTag_PRE);
TY_(Report)(doc, pre, newnode, INSERTING_TAG);
pre = newnode;
TY_(InsertNodeAfterElement)(node, pre);
continue;
}
if ( nodeIsP(node) )
{
if (node->type == StartTag)
{
TY_(Report)(doc, pre, node, USING_BR_INPLACE_OF);
/* trim white space before <p> in <pre> */
TrimSpaces(doc, pre);
/* coerce <p> to <br>; any other <p> token (e.g. </p>) is discarded
* by the else branch below */
TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */
TY_(InsertNodeAtEnd)( pre, node );
}
else
{
TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)( doc, node);
}
continue;
}
if ( TY_(nodeIsElement)(node) )
{
/* trim white space before <br> */
if ( nodeIsBR(node) )
TrimSpaces(doc, pre);
TY_(InsertNodeAtEnd)(pre, node);
ParseTag(doc, node, Preformatted);
continue;
}
/* discard unexpected tags */
TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)( doc, node);
}
/* the token stream ended before the <pre> was closed; node is NULL here */
TY_(Report)(doc, pre, node, MISSING_ENDTAG_FOR);
return NULL;
}
/** MARK: TY_(oldParseOptGroup)
 *  Parses the content of an `optgroup` element.
 *
 *  `option` and `optgroup` start tags are appended and parsed in turn (a
 *  nested `optgroup` is additionally reported as CANT_BE_NESTED); comments
 *  and similar nodes go through InsertMisc(); every other token is reported
 *  and discarded. No report is issued if the token stream ends before the
 *  end tag is seen.
 *
 *  @param doc   The document being parsed.
 *  @param field The `optgroup` node whose content is being read.
 *  @param mode  Unused.
 *  @returns Always NULL.
 */
void* TY_(oldParseOptGroup)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode))
{
    Node *child;

    doc->lexer->insert = NULL;  /* defer implicit inline start tags */

    for ( child = TY_(GetToken)(doc, IgnoreWhitespace);
          child != NULL;
          child = TY_(GetToken)(doc, IgnoreWhitespace) )
    {
        /* our own end tag closes the group */
        if ( child->type == EndTag && child->tag == field->tag )
        {
            TY_(FreeNode)( doc, child );
            field->closed = yes;
            TrimSpaces( doc, field );
            return NULL;
        }

        /* deal with comments etc. */
        if ( InsertMisc(field, child) )
            continue;

        /* discard unexpected tags */
        if ( child->type != StartTag ||
             !( nodeIsOPTION(child) || nodeIsOPTGROUP(child) ) )
        {
            TY_(Report)( doc, field, child, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, child );
            continue;
        }

        /* a nested optgroup is kept, but flagged */
        if ( nodeIsOPTGROUP(child) )
            TY_(Report)( doc, field, child, CANT_BE_NESTED );

        TY_(InsertNodeAtEnd)( field, child );
        ParseTag( doc, child, MixedContent );
    }

    return NULL;
}
/** MARK: TY_(oldParseSelect)
 *  Parses the content of a `select` element.
 *
 *  Start tags for `option`, `optgroup`, `datalist` and `script` are
 *  appended to the element and parsed in turn; comments and similar nodes
 *  go through InsertMisc(); every other token is reported and discarded.
 *  MISSING_ENDTAG_FOR is reported if the token stream ends before the end
 *  tag is seen.
 *
 *  @param doc   The document being parsed.
 *  @param field The `select` node whose content is being read.
 *  @param mode  Unused.
 *  @returns Always NULL.
 */
void* TY_(oldParseSelect)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode))
{
#if defined(ENABLE_DEBUG_LOG)
    /* depth of active calls, used only by the debug trace */
    static int in_parse_select = 0;
#endif
    Node *child;

    doc->lexer->insert = NULL;  /* defer implicit inline start tags */

#if defined(ENABLE_DEBUG_LOG)
    in_parse_select++;
    SPRTF("Entering ParseSelect %d...\n",in_parse_select);
#endif

    for ( child = TY_(GetToken)(doc, IgnoreWhitespace);
          child != NULL;
          child = TY_(GetToken)(doc, IgnoreWhitespace) )
    {
        /* our own end tag closes the element */
        if ( child->type == EndTag && child->tag == field->tag )
        {
            TY_(FreeNode)( doc, child );
            field->closed = yes;
            TrimSpaces( doc, field );
#if defined(ENABLE_DEBUG_LOG)
            in_parse_select--;
            SPRTF("Exit ParseSelect 1 %d...\n",in_parse_select);
#endif
            return NULL;
        }

        /* deal with comments etc. */
        if ( InsertMisc(field, child) )
            continue;

        if ( child->type == StartTag &&
             ( nodeIsOPTION(child)   || nodeIsOPTGROUP(child) ||
               nodeIsDATALIST(child) || nodeIsSCRIPT(child) ) )
        {
            TY_(InsertNodeAtEnd)( field, child );
            ParseTag( doc, child, IgnoreWhitespace );
        }
        else
        {
            /* discard unexpected tags */
            TY_(Report)( doc, field, child, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, child );
        }
    }

    /* ran out of tokens before the end tag; child is NULL here */
    TY_(Report)( doc, field, child, MISSING_ENDTAG_FOR );
#if defined(ENABLE_DEBUG_LOG)
    in_parse_select--;
    SPRTF("Exit ParseSelect 2 %d...\n",in_parse_select);
#endif
    return NULL;
}
/** MARK: TY_(oldParseDatalist)
 *  Parses the content of a `datalist` element.
 *
 *  Start tags for `option`, `optgroup`, `datalist` and `script` are
 *  appended to the element and parsed in turn; comments and similar nodes
 *  go through InsertMisc(); every other token is reported and discarded.
 *  MISSING_ENDTAG_FOR is reported if the token stream ends before the end
 *  tag is seen.
 *
 *  @param doc   The document being parsed.
 *  @param field The `datalist` node whose content is being read.
 *  @param mode  Unused.
 *  @returns Always NULL.
 */
void* TY_(oldParseDatalist)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode))
{
#if defined(ENABLE_DEBUG_LOG)
    /* depth of active calls, used only by the debug trace */
    static int in_parse_datalist = 0;
#endif
    Node *child;

    doc->lexer->insert = NULL;  /* defer implicit inline start tags */

#if defined(ENABLE_DEBUG_LOG)
    in_parse_datalist++;
    SPRTF("Entering ParseDatalist %d...\n",in_parse_datalist);
#endif

    for ( child = TY_(GetToken)(doc, IgnoreWhitespace);
          child != NULL;
          child = TY_(GetToken)(doc, IgnoreWhitespace) )
    {
        /* our own end tag closes the element */
        if ( child->type == EndTag && child->tag == field->tag )
        {
            TY_(FreeNode)( doc, child );
            field->closed = yes;
            TrimSpaces( doc, field );
#if defined(ENABLE_DEBUG_LOG)
            in_parse_datalist--;
            SPRTF("Exit ParseDatalist 1 %d...\n",in_parse_datalist);
#endif
            return NULL;
        }

        /* deal with comments etc. */
        if ( InsertMisc(field, child) )
            continue;

        if ( child->type == StartTag &&
             ( nodeIsOPTION(child)   || nodeIsOPTGROUP(child) ||
               nodeIsDATALIST(child) || nodeIsSCRIPT(child) ) )
        {
            TY_(InsertNodeAtEnd)( field, child );
            ParseTag( doc, child, IgnoreWhitespace );
        }
        else
        {
            /* discard unexpected tags */
            TY_(Report)( doc, field, child, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, child );
        }
    }

    /* ran out of tokens before the end tag; child is NULL here */
    TY_(Report)( doc, field, child, MISSING_ENDTAG_FOR );
#if defined(ENABLE_DEBUG_LOG)
    in_parse_datalist--;
    SPRTF("Exit ParseDatalist 2 %d...\n",in_parse_datalist);
#endif
    return NULL;
}
/** MARK: TY_(oldParseText)
 *  Parses the `option` and `textarea` tags.
 *
 *  Text nodes are appended (empty ones are freed), comments and similar
 *  nodes go through InsertMisc(), and inline tags without CM_FIELD are
 *  reported and discarded. Any other token ends the element and is handed
 *  back with UngetToken. The missing-end-tag reports are suppressed for
 *  elements whose content model has CM_OPT.
 *
 *  @param doc   The document being parsed.
 *  @param field The node whose content is being read.
 *  @param mode  Ignored on entry; overwritten with Preformatted for
 *               `textarea` and MixedContent otherwise.
 *  @returns Always NULL.
 */
void* TY_(oldParseText)(TidyDocImpl* doc, Node *field, GetTokenMode mode)
{
    Node *token;

    doc->lexer->insert = NULL;  /* defer implicit inline start tags */

    /* MixedContent is a kludge for font tags */
    mode = nodeIsTEXTAREA(field) ? Preformatted : MixedContent;

    for ( token = TY_(GetToken)(doc, mode);
          token != NULL;
          token = TY_(GetToken)(doc, mode) )
    {
        /* our own end tag closes the element */
        if ( token->type == EndTag && token->tag == field->tag )
        {
            TY_(FreeNode)( doc, token );
            field->closed = yes;
            TrimSpaces( doc, field );
            return NULL;
        }

        /* deal with comments etc. */
        if ( InsertMisc(field, token) )
            continue;

        if ( TY_(nodeIsText)(token) )
        {
            /* only called for 1st child */
            if ( field->content == NULL && (mode & Preformatted) == 0 )
                TrimSpaces( doc, field );

            /* keep the text only if it is non-empty */
            if ( token->start < token->end )
                TY_(InsertNodeAtEnd)( field, token );
            else
                TY_(FreeNode)( doc, token );
            continue;
        }

        /* for textarea should all cases of < and & be escaped? */

        /* discard inline tags e.g. font */
        if ( token->tag != NULL
             && (token->tag->model & CM_INLINE) != 0
             && (token->tag->model & CM_FIELD) == 0 ) /* #487283 - fix by Lee Passey 25 Jan 02 */
        {
            TY_(Report)( doc, field, token, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, token );
            continue;
        }

        /* terminate element on other tags */
        if ( (field->tag->model & CM_OPT) == 0 )
            TY_(Report)( doc, field, token, MISSING_ENDTAG_BEFORE );

        TY_(UngetToken)( doc );
        TrimSpaces( doc, field );
        return NULL;
    }

    /* ran out of tokens before the end tag; token is NULL here */
    if ( (field->tag->model & CM_OPT) == 0 )
        TY_(Report)( doc, field, token, MISSING_ENDTAG_FOR );

    return NULL;
}
/** MARK: TY_(oldParseTitle)
 *  Parses the content of a `title` element.
 *
 *  Text nodes are appended (leading space is trimmed on the first one, and
 *  empty ones are freed); comments and similar nodes go through
 *  InsertMisc(); unknown tags are reported and discarded. Any other token
 *  ends the title and is handed back with UngetToken. When the
 *  TidyCoerceEndTags option is set, a second `title` start tag is turned
 *  into an end tag and re-read.
 *
 *  @param doc   The document being parsed.
 *  @param title The `title` node whose content is being read.
 *  @param mode  Unused.
 *  @returns Always NULL.
 */
void* TY_(oldParseTitle)(TidyDocImpl* doc, Node *title, GetTokenMode ARG_UNUSED(mode))
{
    Node *token;

    for ( token = TY_(GetToken)(doc, MixedContent);
          token != NULL;
          token = TY_(GetToken)(doc, MixedContent) )
    {
        if ( token->tag == title->tag )
        {
            /* <title> inside <title>: treat it as the missing </title> */
            if ( token->type == StartTag && cfgBool(doc, TidyCoerceEndTags) )
            {
                TY_(Report)( doc, title, token, COERCE_TO_ENDTAG );
                token->type = EndTag;
                TY_(UngetToken)( doc );
                continue;
            }

            /* our own end tag closes the element */
            if ( token->type == EndTag )
            {
                TY_(FreeNode)( doc, token );
                title->closed = yes;
                TrimSpaces( doc, title );
                return NULL;
            }
        }

        if ( TY_(nodeIsText)(token) )
        {
            /* only called for 1st child */
            if ( title->content == NULL )
                TrimInitialSpace( doc, title, token );

            /* keep the text only if it is non-empty */
            if ( token->start < token->end )
                TY_(InsertNodeAtEnd)( title, token );
            else
                TY_(FreeNode)( doc, token );
            continue;
        }

        /* deal with comments etc. */
        if ( InsertMisc(title, token) )
            continue;

        /* discard unknown tags */
        if ( token->tag == NULL )
        {
            TY_(Report)( doc, title, token, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, token );
            continue;
        }

        /* pushback unexpected tokens */
        TY_(Report)( doc, title, token, MISSING_ENDTAG_BEFORE );
        TY_(UngetToken)( doc );
        TrimSpaces( doc, title );
        return NULL;
    }

    /* ran out of tokens before the end tag; token is NULL here */
    TY_(Report)( doc, title, token, MISSING_ENDTAG_FOR );
    return NULL;
}
/** MARK: TY_(oldParseScript)
* Parses the `script` tag.
*
* @todo This isn't quite right for CDATA content as it recognises tags
* within the content and parses them accordingly. This will unfortunately
* screw up scripts which include:
* < + letter
* < + !
* < + ?
* < + / + letter
*/
void* TY_(oldParseScript)(TidyDocImpl* doc, Node *script, GetTokenMode ARG_UNUSED(mode))
{
Node *node;
doc->lexer->parent = script;
node = TY_(GetToken)(doc, CdataContent);
doc->lexer->parent = NULL;
if (node)
{
TY_(InsertNodeAtEnd)(script, node);
}
else
{
/* handle e.g. a document like "