...
......
...
*/ if (node->prev == NULL && !TY_(nodeHasCM)(node->parent, CM_INLINE)) return yes; /*...
... */ if (node->prev && !TY_(nodeHasCM)(node->prev, CM_INLINE) && TY_(nodeIsElement)(node->prev)) return yes; /*...
*/ if (!node->prev && !node->parent->prev && !TY_(nodeHasCM)(node->parent->parent, CM_INLINE)) return yes; return no; } /** * Indicates whether or not trailing whitespace should be cleaned. */ static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node) { Node* next; if (!TY_(nodeIsText)(node)) return no; if (node->parent->type == DocTypeTag) return no; if (IsPreDescendant(node)) return no; if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript)) return no; next = node->next; /*...
*/ if (!next && !TY_(nodeHasCM)(node->parent, CM_INLINE)) return yes; /*...
...
*/ if (next->type == StartTag) return yes; /* ...*/ if (next->type == StartEndTag) return yes; /* evil adjacent text nodes, Tidy should not generate these :-( */ if (TY_(nodeIsText)(next) && next->start < next->end && TY_(IsWhite)(doc->lexer->lexbuf[next->start])) return yes; return no; } /***************************************************************************//* ** MARK: - Information Accumulation ***************************************************************************/ /** * Errors in positioning of form start or end tags * generally require human intervention to fix. * Issue #166 - repeated
hello world * to *
hello world
*
* Trims initial space, by moving it before the
* start tag, or if this element is the first in
* parent's content, then by discarding the space
*/
static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text )
{
    Lexer* lexer = doc->lexer;
    Node *prev, *node;

    /*
      Only act on a non-empty text node whose first character is a space.
      The range test must come BEFORE the buffer read: for an empty node
      (start == end) lexbuf[start] is not part of this node's text and may
      lie at or beyond the end of the valid buffer contents.
    */
    if ( TY_(nodeIsText)(text) &&
         text->start < text->end &&
         lexer->lexbuf[text->start] == ' ' )
    {
        /* Only inline, non-field elements get their space moved out in
           front of them; for everything else the space is just dropped. */
        if ( (element->tag->model & CM_INLINE) &&
             !(element->tag->model & CM_FIELD) )
        {
            prev = element->prev;

            if (TY_(nodeIsText)(prev))
            {
                /* Extend the preceding text node by one space, unless it
                   already ends with one. */
                if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ')
                    lexer->lexbuf[(prev->end)++] = ' ';

                ++(element->start);
            }
            else /* create new node */
            {
                /* One-character text node holding the space, carved out of
                   the start of the element's range in the lexer buffer. */
                node = TY_(NewNode)(lexer->allocator, lexer);
                node->start = (element->start)++;
                node->end = element->start;
                lexer->lexbuf[node->start] = ' ';
                TY_(InsertNodeBeforeElement)(element ,node);
                DEBUG_LOG(SPRTF("TrimInitialSpace: Created text node, inserted before <%s>\n",
                    (element->element ? element->element : "unknown")));
            }
        }

        /* discard the space in current node */
        ++(text->start);
    }
}
/**
 *  Strips one trailing space from the last text child of an element.
 *
 *  Illustrative example (inline element):
 *     <em>hello </em><strong>world</strong>
 *  becomes
 *     <em>hello</em> <strong>world</strong>
 *
 *  If `last` is a non-empty text node ending in a space, that space is
 *  removed from the node. When `element` is inline and not a form field,
 *  the lexer's `insertspace` flag is raised so the space can be re-emitted
 *  after the element's end tag.
 */
static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last )
{
    Lexer* lexer = doc->lexer;

    /* Nothing to trim unless this is a text node with at least one char. */
    if ( !TY_(nodeIsText)(last) || last->end <= last->start )
        return;

    /* Only a plain space character is trimmed. */
    if ( (byte) lexer->lexbuf[ last->end - 1 ] != ' ' )
        return;

    last->end -= 1;

    if ( (element->tag->model & CM_INLINE) &&
         !(element->tag->model & CM_FIELD) )
    {
        lexer->insertspace = yes;
    }
}
/**
 *  Move initial and trailing space out of an element.
 *
 *  Illustrative examples:
 *     hello<em> world</em>
 *  becomes
 *     hello <em>world</em>
 *  and
 *     <em>hello </em><strong>world</strong>
 *  becomes
 *     <em>hello</em> <strong>world</strong>
 *
 *  Nothing is done for a PRE element or anything nested inside one.
 */
static void TrimSpaces( TidyDocImpl* doc, Node *element)
{
    Node *first_child;
    Node *last_child;

    /* Whitespace is left alone in PRE and below it. */
    if ( nodeIsPRE(element) || IsPreDescendant(element) )
        return;

    first_child = element->content;
    if ( TY_(nodeIsText)(first_child) )
        TrimInitialSpace( doc, element, first_child );

    last_child = element->last;
    if ( TY_(nodeIsText)(last_child) )
        TrimTrailingSpace( doc, element, last_child );
}
/***************************************************************************//*
** MARK: - Parsers Support
***************************************************************************/
/**
 * State shared between a tree traversal and its callback, FindDescendant_cb.
 * The caller fills in the input fields; the callback writes the output
 * fields while the traversal runs.
 */
struct MatchingDescendantData
{
/* output: the node accepted as a match by the callback, if any. */
Node *found_node;
/* output (optional, may be NULL): when non-NULL, the pointed-to flag is set
   to `yes` once the callback is invoked on `marker_node`. */
Bool *passed_marker_node;
/* input: */
/* tag id a visited node must have to be considered a match. */
TidyTagId matching_tagId;
/* node being matched; its element name is compared when the tag id is
   TidyTag_UNKNOWN. */
Node *node_to_find;
/* node whose visit is reported through `passed_marker_node`. */
Node *marker_node;
};
/**
 * Traversal callback used by FindMatchingDescendant.
 *
 * `propagate` must point at a struct MatchingDescendantData. A node matches
 * when its tag id equals `matching_tagId`; for TidyTag_UNKNOWN the element
 * names must additionally compare equal. On a match the node is stored in
 * `found_node` and ExitTraversal is returned. Otherwise the marker flag is
 * updated (when requested) and VisitParent is returned.
 */
static NodeTraversalSignal FindDescendant_cb(TidyDocImpl* ARG_UNUSED(doc), Node* node, void *propagate)
{
    struct MatchingDescendantData *ctx = (struct MatchingDescendantData *)propagate;

    if (TagId(node) == ctx->matching_tagId)
    {
        Bool accepted = yes;

        /* 'unknown' tags all share one id, so they are told apart by name. */
        if (ctx->matching_tagId == TidyTag_UNKNOWN)
        {
            Node *wanted = ctx->node_to_find;

            accepted = (node->element != NULL &&
                        wanted != NULL &&
                        wanted->element != NULL &&
                        TY_(tmbstrcmp)(wanted->element, node->element) == 0) ? yes : no;
        }

        if (accepted)
        {
            ctx->found_node = node;
            return ExitTraversal;
        }
    }

    /* Report that the marker node has been visited, if the caller asked. */
    if (ctx->passed_marker_node && node == ctx->marker_node)
        *ctx->passed_marker_node = yes;

    return VisitParent;
}
/**
 * Search the parent chain (from `parent` upwards up to the root) for a node
 * matching the given 'node'.
 *
 * A node matches when it has the same tag id as `node`; for unknown tags the
 * element names must also be equal (see FindDescendant_cb).
 *
 * When the search passes beyond the `marker_node` (which is assumed to sit
 * in the parent chain), this will be flagged by setting the boolean
 * referenced by `is_parent_of_marker` to `yes`. The boolean is reset to `no`
 * before the search starts.
 *
 * 'is_parent_of_marker' and 'marker_node' are optional parameters and may
 * be NULL. `node` must not be NULL.
 *
 * Returns the matching node, or NULL when none was found.
 */
static Node *FindMatchingDescendant( Node *parent, Node *node, Node *marker_node, Bool *is_parent_of_marker )
{
    struct MatchingDescendantData cb_data = { 0 };

    /* Validate the argument before it is used below. */
    assert(node);

    cb_data.matching_tagId = TagId(node);
    cb_data.node_to_find = node;
    cb_data.marker_node = marker_node;
    /* Hand the out-parameter to the callback; without this the callback
       never has anywhere to record that the marker was passed. */
    cb_data.passed_marker_node = is_parent_of_marker;

    if (is_parent_of_marker)
        *is_parent_of_marker = no;

    TY_(TraverseNodeTree)(NULL, parent, FindDescendant_cb, &cb_data);
    return cb_data.found_node;
}
/**
 * Finds the last LI start-tag child of the given list and stores it in the
 * out parameter `lastli` (NULL when the list has no such child).
 * Returns `yes` when an item was found, `no` otherwise.
 */
static Bool FindLastLI( Node *list, Node **lastli )
{
    Node *child = list->content;

    *lastli = NULL;
    while ( child != NULL )
    {
        /* Keep overwriting: the final assignment is the last LI. */
        if ( nodeIsLI(child) && child->type == StartTag )
            *lastli = child;
        child = child->next;
    }

    if ( *lastli )
        return yes;
    return no;
}
/***************************************************************************//*
** MARK: - Parser Stack
***************************************************************************/
/**
 * Allocates the parser memory stack with its initial capacity and marks it
 * empty (top == -1).
 */
void TY_(InitParserStack)( TidyDocImpl* doc )
{
    enum { initial_capacity = 32 };

    doc->stack.content = (TidyParserMemory *) TidyAlloc( doc->allocator, sizeof(TidyParserMemory) * initial_capacity );
    doc->stack.size = initial_capacity;
    doc->stack.top = -1;
}
/**
 * Releases the parser memory stack's storage and resets the stack to an
 * empty, unallocated state.
 */
void TY_(FreeParserStack)( TidyDocImpl* doc )
{
    TidyFree( doc->allocator, doc->stack.content );

    /* Leave no dangling pointer or stale bookkeeping behind. */
    doc->stack.top = -1;
    doc->stack.size = 0;
    doc->stack.content = NULL;
}
/**
 * Increase the stack size.
 *
 * The capacity is doubled and the live entries (indices 0..top) are copied
 * into the new storage. If the stack currently has no capacity (size 0, as
 * left by FreeParserStack), a non-zero fallback capacity is used instead,
 * because doubling zero would allocate nothing and the next push would write
 * out of bounds.
 */
static void growParserStack( TidyDocImpl* doc )
{
    enum { fallback_size = 32 };
    size_t new_size = doc->stack.size > 0 ? (size_t) doc->stack.size * 2
                                          : (size_t) fallback_size;
    TidyParserMemory *content;

    content = (TidyParserMemory *) TidyAlloc( doc->allocator, sizeof(TidyParserMemory) * new_size );

    /* Passing a NULL source to memcpy is undefined even for zero bytes, so
       only copy and free when there actually is an old buffer. */
    if ( doc->stack.content != NULL )
    {
        memcpy( content, doc->stack.content, sizeof(TidyParserMemory) * (doc->stack.top + 1) );
        TidyFree(doc->allocator, doc->stack.content);
    }

    doc->stack.content = content;
    doc->stack.size = new_size;
}
/**
 * Reports whether the parser memory stack currently holds no entries,
 * i.e. whether its top index is negative.
 */
Bool TY_(isEmptyParserStack)( TidyDocImpl* doc )
{
    Bool is_empty = ( 0 > doc->stack.top );

    return is_empty;
}
/**
 * Peek at the parser memory: returns a copy of the top stack entry without
 * removing it.
 *
 * On an empty stack a blank record is returned (the same result popMemory
 * gives for an empty stack) instead of reading before the start of the
 * storage.
 */
TidyParserMemory TY_(peekMemory)( TidyDocImpl* doc )
{
    if ( TY_(isEmptyParserStack)( doc ) )
    {
        TidyParserMemory blank = { NULL };
        return blank;
    }

    return doc->stack.content[doc->stack.top];
}
/**
 * Peek at the parser memory "identity" field. This is just a convenience
 * to avoid having to create a new struct instance in the caller.
 *
 * Returns NULL when the stack is empty, rather than indexing the storage
 * with a negative top.
 */
Parser* TY_(peekMemoryIdentity)( TidyDocImpl* doc )
{
    if ( TY_(isEmptyParserStack)( doc ) )
        return NULL;

    return doc->stack.content[doc->stack.top].identity;
}
/**
 * Peek at the parser memory "mode" field. This is just a convenience
 * to avoid having to create a new struct instance in the caller.
 *
 * Returns IgnoreWhitespace when the stack is empty, rather than indexing
 * the storage with a negative top.
 */
GetTokenMode TY_(peekMemoryMode)( TidyDocImpl* doc )
{
    if ( TY_(isEmptyParserStack)( doc ) )
        return IgnoreWhitespace;

    return doc->stack.content[doc->stack.top].mode;
}
/**
 * Removes the top entry from the parser memory stack and returns it.
 * When the stack is empty, a blank record is returned and the stack is
 * left untouched.
 */
TidyParserMemory TY_(popMemory)( TidyDocImpl* doc )
{
    TidyParserMemory data;

    if ( TY_(isEmptyParserStack)( doc ) )
    {
        TidyParserMemory blank = { NULL };
        return blank;
    }

    data = doc->stack.content[doc->stack.top];
    DEBUG_LOG(SPRTF("\n"
                    "<--POP  original: %s @ %p\n"
                    "         reentry: %s @ %p\n"
                    "     stack depth: %lu @ %p\n"
                    "      register 1: %i\n"
                    "      register 2: %i\n\n",
                    data.original_node ? data.original_node->element : "none", data.original_node,
                    data.reentry_node ? data.reentry_node->element : "none", data.reentry_node,
                    doc->stack.top, &doc->stack.content[doc->stack.top],
                    data.register_1,
                    data.register_2
                    ));
    doc->stack.top = doc->stack.top - 1;
    return data;
}
/**
 * Places a copy of `data` on top of the parser memory stack, enlarging the
 * stack first when it is already full.
 */
void TY_(pushMemory)( TidyDocImpl* doc, TidyParserMemory data )
{
    /* Full when the top index sits on the last slot. */
    if ( doc->stack.top == doc->stack.size - 1 )
    {
        growParserStack( doc );
    }

    doc->stack.top++;
    doc->stack.content[doc->stack.top] = data;

    DEBUG_LOG(SPRTF("\n"
                    "-->PUSH original: %s @ %p\n"
                    "         reentry: %s @ %p\n"
                    "     stack depth: %lu @ %p\n"
                    "      register 1: %i\n"
                    "      register 2: %i\n\n",
                    data.original_node ? data.original_node->element : "none", data.original_node,
                    data.reentry_node ? data.reentry_node->element : "none", data.reentry_node,
                    doc->stack.top, &doc->stack.content[doc->stack.top],
                    data.register_1,
                    data.register_2
                    ));
}
/***************************************************************************//*
** MARK: - Parser Search and Instantiation
***************************************************************************/
/**
 * Selects the parser that should handle `node` and adjusts lexer state for
 * it.
 *
 * Returns ParseXMLElement whenever the TidyXmlTags option is on. Otherwise
 * returns NULL when there is no node, the node has no tag, the tag has no
 * parser, or the node is a StartEndTag; in all remaining cases the tag's
 * parser is returned and the node becomes the lexer's `parent`.
 */
static Parser* GetParserForNode( TidyDocImpl* doc, Node *node )
{
    Lexer* lexer = doc->lexer;

    if ( cfgBool( doc, TidyXmlTags ) )
        return ParseXMLElement;

    /* [i_a]2 prevent crash for active content (php, asp) docs */
    if ( node == NULL || node->tag == NULL )
        return NULL;

    /*
       Whitespace bookkeeping (fix by GLP 2000-12-21): empty tags clear
       `waswhite`; tags that are neither empty nor inline clear
       `insertspace`.
    */
    if ( node->tag->model & CM_EMPTY )
        lexer->waswhite = no;
    else if ( !(node->tag->model & CM_INLINE) )
        lexer->insertspace = no;

    /* Nothing to dispatch to without a parser, or for a self-closed tag. */
    if ( node->tag->parser == NULL || node->type == StartEndTag )
        return NULL;

    /* [i_a]2 added this - not sure why - CHECKME: */
    lexer->parent = node;

    return node->tag->parser;
}
/**
 * Central, non-recursive dispatcher for the parsers.
 *
 * Starting with the parser for `node`, it repeatedly calls the current
 * parser. A parser may return a node, which is then dispatched to that
 * node's parser. When no node is returned, work saved on the parser memory
 * stack is resumed, and when the stack offers nothing to resume, a new token
 * is drawn from the lexer. The loop ends when the lexer has no more tokens.
 */
void ParseHTMLWithNode( TidyDocImpl* doc, Node* node )
{
    GetTokenMode mode = IgnoreWhitespace;
    Parser* parser = GetParserForNode( doc, node );

    for (;;)
    {
        /* Run the current parser, if there is one. */
        if ( parser )
            node = parser( doc, node, mode );
        else
            node = NULL;

        /*
           A returned node goes straight to its own parser; anything the
           previous parser deferred is already on the stack.
        */
        if ( node != NULL )
        {
            parser = GetParserForNode( doc, node );
            continue;
        }

        /*
           No node came back, so this branch is finished. Resume from the
           stack if it holds anything.
        */
        if ( !TY_(isEmptyParserStack)(doc) )
        {
            parser = TY_(peekMemoryIdentity)(doc);
            if ( parser != NULL )
                continue; /* re-enter that parser; node is NULL here */

            /* No parser means we're only passing back a parsing mode. */
            mode = TY_(peekMemoryMode)( doc );
            TY_(popMemory)( doc );
        }

        /* Nothing left to resume: pull the next token from the lexer. */
        node = TY_(GetToken)( doc, mode );
        DEBUG_LOG(SPRTF("---ParseHTMLWithNode got token %s with mode %u.\n", node ? node->element : NULL, mode));

        if ( node == NULL )
            break;

        parser = GetParserForNode( doc, node );
    }
}
/***************************************************************************//*
** MARK: - Parsers
***************************************************************************/
/** MARK: TY_(ParseBlock)
* `element` is a node created by the lexer upon seeing the start tag, or
* by the parser when the start tag is inferred
*
* This is a non-recursing parser. It uses the document's parser memory stack
* to send subsequent nodes back to the controller for dispatching to parsers.
* This parser is also re-enterable, so that post-processing can occur after
* such dispatching.
*/
Node* TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
{
#if defined(ENABLE_DEBUG_LOG)
static int in_parse_block = 0;
static int parse_block_cnt = 0;
#endif
Lexer* lexer = doc->lexer;
Node *node;
Bool checkstack = yes;
uint istackbase = 0;
if ( element == NULL )
{
TidyParserMemory memory = TY_(popMemory)( doc );
node = memory.reentry_node; /* Throwaway, because the loop overwrites this immediately. */
mode = memory.reentry_mode;
element = memory.original_node;
DEBUG_LOG(SPRTF(">>>Re-Enter ParseBlock with %s\n", node->element));
}
else
{
DEBUG_LOG(SPRTF(">>>Entering ParseBlock %d... %d %s\n",++in_parse_block,++parse_block_cnt,
((element && element->element) ? element->element : "")));
if ( element->tag->model & CM_EMPTY )
{
DEBUG_LOG(SPRTF("<<
*/
if ( nodeIsBR(node) )
TrimSpaces( doc, element );
TY_(InsertNodeAtEnd)(element, node);
if (node->implicit)
TY_(Report)(doc, element, node, INSERTING_TAG );
/* Issue #212 - WHY is this hard coded to 'IgnoreWhitespace' while an
effort has been made above to set a 'MixedContent' mode in some cases?
WHY IS THE 'mode' VARIABLE NOT USED HERE???? */
{
TidyParserMemory memory = {0};
memory.identity = TY_(ParseBlock);
memory.reentry_node = node;
memory.reentry_mode = mode;
memory.original_node = element;
TY_(pushMemory)(doc, memory);
DEBUG_LOG(SPRTF("<<