2011-11-17 02:44:16 +00:00
/* parser.c -- HTML Parser
( c ) 1998 - 2007 ( W3C ) MIT , ERCIM , Keio University
See tidy . h for the copyright notice .
2014-08-03 18:33:29 +00:00
2011-11-17 02:44:16 +00:00
*/
# include "tidy-int.h"
# include "lexer.h"
# include "parser.h"
# include "message.h"
# include "clean.h"
# include "tags.h"
# include "tmbstr.h"
2017-10-13 18:50:53 +00:00
# include "sprtf.h"
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/****************************************************************************//*
 ** MARK: - Configuration Options
 ***************************************************************************/
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
 *  Evaluates to `yes` when the TidyBodyOnly option of `doc` is set to
 *  TidyYesState, and to `no` otherwise.
 *
 *  Issue #72  - Needed in order to avoid error reporting (no warning) only
 *               when --show-body-only is yes.
 *  Issue #132 - Likewise avoid warning if showing body only.
 *
 *  The complete expansion is wrapped in parentheses so that the conditional
 *  expression cannot be split apart by operators surrounding the macro call,
 *  e.g. `showingBodyOnly(doc) && other`.
 */
#define showingBodyOnly(doc) ((cfgAutoBool((doc), TidyBodyOnly) == TidyYesState) ? yes : no)
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/****************************************************************************//*
 ** MARK: - Node Operations
 ***************************************************************************/
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
2021-08-05 12:18:30 +00:00
* Generalised search for duplicate elements .
* Issue # 166 - repeated < main > element .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
static Bool findNodeWithId ( Node * node , TidyTagId tid )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
Node * content ;
while ( node )
{
if ( TagIsId ( node , tid ) )
return yes ;
/*\
* Issue # 459 - Under certain circumstances , with many node this use of
* ' for ( content = node - > content ; content ; content = content - > content ) '
* would produce a * * forever * * circle , or at least a very extended loop . . .
* It is sufficient to test the content , if it exists ,
* to quickly iterate all nodes . Now all nodes are tested only once .
\ */
content = node - > content ;
if ( content )
{
if ( findNodeWithId ( content , tid ) )
return yes ;
}
node = node - > next ;
}
return no ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/**
 *  Performs a document-wide search for an element with tag id `tid`,
 *  beginning at the content of the document root.
 *  Issue #166 - repeated <main> element.
 *
 *  @param doc  The document to search; a NULL document yields `no`
 *              because the search then starts from a NULL node.
 *  @param tid  Tag id to look for.
 *  @returns `yes` if such an element exists, otherwise `no`.
 */
static Bool findNodeById( TidyDocImpl *doc, TidyTagId tid )
{
    if ( doc == NULL )
        return findNodeWithId( NULL, tid );

    return findNodeWithId( doc->root.content, tid );
}
2021-07-28 23:45:57 +00:00
/**
 *  Inserts `node` into `element` at a location that depends on the type
 *  of node being inserted.
 *
 *  - Comment, processing instruction, CDATA, section, ASP, JSTE and PHP
 *    nodes are appended to `element`.
 *  - An XML declaration is placed at the start of the topmost ancestor of
 *    `element`, unless that ancestor already begins with one.
 *  - Empty-content-model elements with an unknown tag id whose tag is
 *    flagged proprietary are appended to `element`.
 *
 *  @returns `yes` if the node was inserted, `no` if the caller still owns it.
 */
static Bool InsertMisc( Node *element, Node *node )
{
    Bool isMiscType = ( node->type == CommentTag ||
                        node->type == ProcInsTag ||
                        node->type == CDATATag   ||
                        node->type == SectionTag ||
                        node->type == AspTag     ||
                        node->type == JsteTag    ||
                        node->type == PhpTag ) ? yes : no;

    if ( isMiscType )
    {
        TY_(InsertNodeAtEnd)( element, node );
        return yes;
    }

    if ( node->type == XmlDecl )
    {
        /* Climb to the topmost ancestor. */
        Node *top = element;

        while ( top && top->parent )
            top = top->parent;

        if ( top && !( top->content && top->content->type == XmlDecl ) )
        {
            TY_(InsertNodeAtStart)( top, node );
            return yes;
        }
    }

    /* Declared empty tags seem to be slipping through
    ** the cracks.  This is an experiment to figure out
    ** a decent place to pick them up.
    */
    if ( node->tag == NULL )
        return no;
    if ( !TY_(nodeIsElement)( node ) || !TY_(nodeCMIsEmpty)( node ) )
        return no;
    if ( TagId( node ) != TidyTag_UNKNOWN )
        return no;
    if ( ( node->tag->versions & VERS_PROPRIETARY ) == 0 )
        return no;

    TY_(InsertNodeAtEnd)( element, node );
    return yes;
}
2014-08-03 18:33:29 +00:00
2021-07-28 23:45:57 +00:00
/**
2021-08-05 12:18:30 +00:00
* Insert " node " into markup tree in place of " element "
* which is moved to become the child of the node
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
static void InsertNodeAsParent ( Node * element , Node * node )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
node - > content = element ;
node - > last = element ;
node - > parent = element - > parent ;
element - > parent = node ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
if ( node - > parent - > content = = element )
node - > parent - > content = node ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
if ( node - > parent - > last = = element )
node - > parent - > last = node ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
node - > prev = element - > prev ;
element - > prev = NULL ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
if ( node - > prev )
node - > prev - > next = node ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
node - > next = element - > next ;
element - > next = NULL ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
if ( node - > next )
node - > next - > prev = node ;
2021-07-28 23:45:57 +00:00
}
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
 *  Unexpected content in a table row is moved to just before the table,
 *  in accordance with Netscape and IE. This code assumes that `node`
 *  hasn't been inserted into the row.
 *
 *  The ancestors of `row` are searched for a TABLE element; when none
 *  exists the node is inserted before the row's parent instead.
 */
static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row,
                             Node *node )
{
    Node *target = row->parent;

    /* first find the table element */
    while ( target != NULL && !nodeIsTABLE( target ) )
        target = target->parent;

    /* No table element */
    if ( target == NULL )
        target = row->parent;

    TY_(InsertNodeBeforeElement)( target, node );
}
/**
2021-08-05 12:18:30 +00:00
* Moves given node to end of body element .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
static void MoveNodeToBody ( TidyDocImpl * doc , Node * node )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
Node * body = TY_ ( FindBody ) ( doc ) ;
if ( body )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( RemoveNode ) ( node ) ;
TY_ ( InsertNodeAtEnd ) ( body , node ) ;
2021-07-28 23:45:57 +00:00
}
}
/**
2021-08-05 12:18:30 +00:00
* Move node to the head , where element is used as starting
* point in hunt for head . Normally called during parsing .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
static void MoveToHead ( TidyDocImpl * doc , Node * element , Node * node )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
Node * head = NULL ;
TY_ ( RemoveNode ) ( node ) ; /* make sure that node is isolated */
if ( TY_ ( nodeIsElement ) ( node ) )
{
TY_ ( Report ) ( doc , element , node , TAG_NOT_ALLOWED_IN ) ;
head = TY_ ( FindHEAD ) ( doc ) ;
assert ( head ! = NULL ) ;
TY_ ( InsertNodeAtEnd ) ( head , node ) ;
if ( node - > tag - > parser )
{
/* Only one of the existing test cases as of 2021-08-14 invoke
MoveToHead , and it doesn ' t go deeper than one level . The
parser ( ) call is supposed to return a node if additional
parsing is needed . Keep this in mind if we start to get bug
reports .
*/
Parser * parser = node - > tag - > parser ;
parser ( doc , node , IgnoreWhitespace ) ;
}
}
else
{
TY_ ( Report ) ( doc , element , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
}
2021-07-28 23:45:57 +00:00
}
/***************************************************************************//*
 ** MARK: - Decision Making
 ***************************************************************************/
/**
 *  Indicates whether or not `element` can be pruned, based on its content,
 *  the user's settings, and a list of elements that must be kept even
 *  when empty.
 *
 *  @returns `yes` if the element may be dropped, otherwise `no`.
 */
static Bool CanPrune( TidyDocImpl* doc, Node *element )
{
    Bool hasAttributes;

    if ( !cfgBool( doc, TidyDropEmptyElems ) )
        return no;

    if ( TY_(nodeIsText)( element ) )
        return yes;

    /* Anything with content, or without a tag definition, is kept. */
    if ( element->content != NULL || element->tag == NULL )
        return no;

    hasAttributes = ( element->attributes != NULL ) ? yes : no;

    /* Block elements and anchors are significant once they carry attributes. */
    if ( hasAttributes &&
         ( ( element->tag->model & CM_BLOCK ) || nodeIsA( element ) ) )
        return no;

    if ( nodeIsP( element ) && !cfgBool( doc, TidyDropEmptyParas ) )
        return no;

    if ( element->tag->model & ( CM_ROW | CM_EMPTY ) )
        return no;

    if ( nodeIsAPPLET( element ) || nodeIsOBJECT( element ) )
        return no;

    if ( nodeIsSCRIPT( element ) && attrGetSRC( element ) )
        return no;

    /* IFRAME:   #433359 - fix by Randy Waki 12 Mar 01
       TEXTAREA: fix for bug 770297
       CANVAS, PROGRESS: fix for ISSUE #7
                 https://github.com/w3c/tidy-html5/issues/7 */
    if ( nodeIsTITLE( element )    ||
         nodeIsIFRAME( element )   ||
         nodeIsTEXTAREA( element ) ||
         nodeIsCANVAS( element )   ||
         nodeIsPROGRESS( element ) )
        return no;

    /* DATAFLD: fix for bug 695408; a better fix would look for unknown and
       known proprietary attributes that make the element significant */
    if ( attrGetID( element ) || attrGetNAME( element ) || attrGetDATAFLD( element ) )
        return no;

    /* fix for bug 723772, don't trim new-...-tags */
    if ( element->tag->id == TidyTag_UNKNOWN )
        return no;

    if ( nodeIsBODY( element ) || nodeIsCOLGROUP( element ) )
        return no;

    /* HTML5 - do NOT drop empty option if it has attributes */
    if ( nodeIsOPTION( element ) && hasAttributes )
        return no;

    /* fix for #103 - don't drop empty dd tags lest document not validate */
    if ( nodeIsDD( element ) )
        return no;

    return yes;
}
2021-07-28 23:45:57 +00:00
/**
 *  Indicates whether or not `element` has an ancestor whose tag id is `tid`.
 *  The element itself is not tested, only its parent chain.
 */
static Bool DescendantOf( Node *element, TidyTagId tid )
{
    Node *ancestor = element->parent;

    while ( ancestor != NULL )
    {
        if ( TagIsId( ancestor, tid ) )
            return yes;
        ancestor = ancestor->parent;
    }

    return no;
}
2021-07-28 23:45:57 +00:00
/**
* Indicates whether or not node is a descendant of a pre tag .
*/
2011-11-17 02:44:16 +00:00
static Bool IsPreDescendant ( Node * node )
{
Node * parent = node - > parent ;
while ( parent )
{
if ( parent - > tag & & parent - > tag - > parser = = TY_ ( ParsePre ) )
return yes ;
parent = parent - > parent ;
}
return no ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/**
 *  Indicates whether the content model of the given node includes
 *  CM_INLINE while not including CM_BLOCK.
 */
static Bool nodeCMIsOnlyInline( Node* node )
{
    /* De Morgan form of: hasCM(INLINE) && !hasCM(BLOCK) */
    return !( !TY_(nodeHasCM)( node, CM_INLINE ) || TY_(nodeHasCM)( node, CM_BLOCK ) );
}
/**
 *  Indicates whether or not the given node is acceptable content for
 *  pre elements.
 *
 *  P elements and text are accepted. Otherwise the node needs a tag
 *  definition, must not be PARAM, and must have the CM_INLINE|CM_NEW
 *  content model.
 */
static Bool PreContent( TidyDocImpl* ARG_UNUSED(doc), Node* node )
{
    /* p is coerced to br's, Text OK too */
    if ( nodeIsP( node ) || TY_(nodeIsText)( node ) )
        return yes;

    if ( node->tag == NULL )
        return no;

    if ( nodeIsPARAM( node ) )
        return no;

    return TY_(nodeHasCM)( node, CM_INLINE|CM_NEW ) ? yes : no;
}
/**
 *  Indicates whether or not the leading whitespace of the text node `node`
 *  should be cleaned.
 *
 *  Never for non-text nodes, children of a doctype, descendants of
 *  pre-formatted elements, or children of script-parsed elements.
 */
static Bool CleanLeadingWhitespace( TidyDocImpl* ARG_UNUSED(doc), Node* node )
{
    Node *parent, *prev;

    if ( !TY_(nodeIsText)( node ) )
        return no;

    parent = node->parent;

    if ( parent->type == DocTypeTag || IsPreDescendant( node ) )
        return no;

    if ( parent->tag && parent->tag->parser == TY_(ParseScript) )
        return no;

    prev = node->prev;

    /* <p>...<br> <em>...</em>...</p> */
    if ( nodeIsBR( prev ) )
        return yes;

    if ( prev == NULL )
    {
        /* <p> ...</p> */
        if ( !TY_(nodeHasCM)( parent, CM_INLINE ) )
            return yes;

        /* <p><span> ...</span></p> */
        if ( !parent->prev && !TY_(nodeHasCM)( parent->parent, CM_INLINE ) )
            return yes;

        return no;
    }

    /* <h4>...</h4> <em>...</em> */
    if ( !TY_(nodeHasCM)( prev, CM_INLINE ) && TY_(nodeIsElement)( prev ) )
        return yes;

    return no;
}
2021-07-28 23:45:57 +00:00
/**
* Indicates whether or not trailing whitespace should be cleaned .
*/
2011-11-17 02:44:16 +00:00
static Bool CleanTrailingWhitespace ( TidyDocImpl * doc , Node * node )
{
Node * next ;
if ( ! TY_ ( nodeIsText ) ( node ) )
return no ;
if ( node - > parent - > type = = DocTypeTag )
return no ;
if ( IsPreDescendant ( node ) )
return no ;
if ( node - > parent - > tag & & node - > parent - > tag - > parser = = TY_ ( ParseScript ) )
return no ;
next = node - > next ;
/* <p>... </p> */
if ( ! next & & ! TY_ ( nodeHasCM ) ( node - > parent , CM_INLINE ) )
return yes ;
/* <div><small>... </small><h3>...</h3></div> */
if ( ! next & & node - > parent - > next & & ! TY_ ( nodeHasCM ) ( node - > parent - > next , CM_INLINE ) )
return yes ;
if ( ! next )
return no ;
if ( nodeIsBR ( next ) )
return yes ;
if ( TY_ ( nodeHasCM ) ( next , CM_INLINE ) )
return no ;
/* <a href='/'>...</a> <p>...</p> */
if ( next - > type = = StartTag )
return yes ;
/* <strong>...</strong> <hr /> */
if ( next - > type = = StartEndTag )
return yes ;
/* evil adjacent text nodes, Tidy should not generate these :-( */
if ( TY_ ( nodeIsText ) ( next ) & & next - > start < next - > end
& & TY_ ( IsWhite ) ( doc - > lexer - > lexbuf [ next - > start ] ) )
return yes ;
return no ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/***************************************************************************//*
 ** MARK: - Information Accumulation
 ***************************************************************************/
2021-07-28 23:45:57 +00:00
/**
2021-08-05 12:18:30 +00:00
* Errors in positioning of form start or end tags
* generally require human intervention to fix .
* Issue # 166 - repeated < main > element also uses this flag
* to indicate duplicates , discarded .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
static void BadForm ( TidyDocImpl * doc )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
doc - > badForm | = flg_BadForm ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/***************************************************************************//*
 ** MARK: - Fixes and Touchup
 ***************************************************************************/
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
2021-08-05 12:18:30 +00:00
* Adds style information as a class in the document or a property
* of the node to prevent indentation of inferred UL tags .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
static void AddClassNoIndent ( TidyDocImpl * doc , Node * node )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
ctmbstr sprop =
" padding-left: 2ex; margin-left: 0ex "
" ; margin-top: 0ex; margin-bottom: 0ex " ;
if ( ! cfgBool ( doc , TidyDecorateInferredUL ) )
return ;
if ( cfgBool ( doc , TidyMakeClean ) )
TY_ ( AddStyleAsClass ) ( doc , node , sprop ) ;
else
TY_ ( AddStyleProperty ) ( doc , node , sprop ) ;
2021-07-28 23:45:57 +00:00
}
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
2021-08-05 12:18:30 +00:00
* Cleans whitespace from text nodes , and drops such nodes if emptied
* completely as a result .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
static void CleanSpaces ( TidyDocImpl * doc , Node * node )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
Stack * stack = TY_ ( newStack ) ( doc , 16 ) ;
Node * next ;
while ( node )
{
next = node - > next ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
if ( TY_ ( nodeIsText ) ( node ) & & CleanLeadingWhitespace ( doc , node ) )
while ( node - > start < node - > end & & TY_ ( IsWhite ) ( doc - > lexer - > lexbuf [ node - > start ] ) )
+ + ( node - > start ) ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
if ( TY_ ( nodeIsText ) ( node ) & & CleanTrailingWhitespace ( doc , node ) )
while ( node - > end > node - > start & & TY_ ( IsWhite ) ( doc - > lexer - > lexbuf [ node - > end - 1 ] ) )
- - ( node - > end ) ;
if ( TY_ ( nodeIsText ) ( node ) & & ! ( node - > start < node - > end ) )
{
TY_ ( RemoveNode ) ( node ) ;
TY_ ( FreeNode ) ( doc , node ) ;
node = next ? next : TY_ ( pop ) ( stack ) ;
continue ;
}
if ( node - > content )
{
TY_ ( push ) ( stack , next ) ;
node = node - > content ;
continue ;
}
node = next ? next : TY_ ( pop ) ( stack ) ;
}
TY_ ( freeStack ) ( stack ) ;
}
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
2021-08-05 12:18:30 +00:00
* If a table row is empty then insert an empty cell . This practice is
* consistent with browser behavior and avoids potential problems with
* row spanning cells .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
static void FixEmptyRow ( TidyDocImpl * doc , Node * row )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
Node * cell ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
if ( row - > content = = NULL )
{
cell = TY_ ( InferredTag ) ( doc , TidyTag_TD ) ;
TY_ ( InsertNodeAtEnd ) ( row , cell ) ;
TY_ ( Report ) ( doc , row , cell , MISSING_STARTTAG ) ;
}
}
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
2021-08-05 12:18:30 +00:00
* The doctype has been found after other tags ,
* and needs moving to before the html element
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
static void InsertDocType ( TidyDocImpl * doc , Node * element , Node * doctype )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
Node * existing = TY_ ( FindDocType ) ( doc ) ;
if ( existing )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( Report ) ( doc , element , doctype , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , doctype ) ;
}
else
{
TY_ ( Report ) ( doc , element , doctype , DOCTYPE_AFTER_TAGS ) ;
while ( ! nodeIsHTML ( element ) )
element = element - > parent ;
TY_ ( InsertNodeBeforeElement ) ( element , doctype ) ;
2011-11-17 02:44:16 +00:00
}
}
2021-07-28 23:45:57 +00:00
/**
 *  This maps
 *      <p>hello<em> world</em>
 *  to
 *      <p>hello <em>world</em>
 *
 *  Trims an initial space from `text`. For inline, non-field elements the
 *  space is moved before the start tag: it is appended to a preceding text
 *  node (unless that already ends in a space), or a new one-character text
 *  node is created and inserted before `element`. In every case the space
 *  is then discarded from `text`.
 *
 *  @param doc      The document whose lexer buffer holds the text.
 *  @param element  The element whose first child is `text`.
 *  @param text     The candidate text node; non-text nodes are ignored.
 */
static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text )
{
    Lexer* lexer = doc->lexer;
    Node *prev, *node;

    if ( !TY_(nodeIsText)( text ) )
        return;

    /* Verify that the node is non-empty BEFORE indexing the lexer buffer,
       so that lexbuf[text->start] is only read when it belongs to the node. */
    if ( !( text->start < text->end ) || lexer->lexbuf[text->start] != ' ' )
        return;

    if ( ( element->tag->model & CM_INLINE ) &&
         !( element->tag->model & CM_FIELD ) )
    {
        prev = element->prev;

        if ( TY_(nodeIsText)( prev ) )
        {
            if ( prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ' )
                lexer->lexbuf[(prev->end)++] = ' ';

            ++(element->start);
        }
        else /* create new node */
        {
            node = TY_(NewNode)( lexer->allocator, lexer );
            node->start = (element->start)++;
            node->end = element->start;
            lexer->lexbuf[node->start] = ' ';
            TY_(InsertNodeBeforeElement)( element, node );
            DEBUG_LOG(SPRTF(" TrimInitialSpace: Created text node, inserted before <%s> \n ",
                            (element->element ? element->element : " unknown ")));
        }
    }

    /* discard the space in current node */
    ++(text->start);
}
2021-07-28 23:45:57 +00:00
/**
2021-08-05 12:18:30 +00:00
* This maps
* < em > hello < / em > < strong > world < / strong >
* to
* < em > hello < / em > < strong > world < / strong >
*
* If last child of element is a text node
* then trim trailing white space character
* moving it to after element ' s end tag .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
static void TrimTrailingSpace ( TidyDocImpl * doc , Node * element , Node * last )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
Lexer * lexer = doc - > lexer ;
byte c ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( TY_ ( nodeIsText ) ( last ) )
{
if ( last - > end > last - > start )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
c = ( byte ) lexer - > lexbuf [ last - > end - 1 ] ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( c = = ' ' )
{
last - > end - = 1 ;
if ( ( element - > tag - > model & CM_INLINE ) & &
! ( element - > tag - > model & CM_FIELD ) )
lexer - > insertspace = yes ;
}
2021-07-28 23:45:57 +00:00
}
2011-11-17 02:44:16 +00:00
}
2021-07-28 23:45:57 +00:00
}
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
 *  Move initial and trailing space out.
 *  This routine maps:
 *      hello<em> world</em>
 *  to
 *      hello <em>world</em>
 *  and
 *      <em>hello </em><strong>world</strong>
 *  to
 *      <em>hello</em> <strong>world</strong>
 *
 *  PRE elements and descendants of pre-formatted elements are left alone.
 */
static void TrimSpaces( TidyDocImpl* doc, Node *element )
{
    Node *edge;

    if ( nodeIsPRE( element ) || IsPreDescendant( element ) )
        return;

    edge = element->content;
    if ( TY_(nodeIsText)( edge ) )
        TrimInitialSpace( doc, element, edge );

    /* Fetched only now, after the leading space has been handled. */
    edge = element->last;
    if ( TY_(nodeIsText)( edge ) )
        TrimTrailingSpace( doc, element, edge );
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/***************************************************************************//*
 ** MARK: - Parsers Support
 ***************************************************************************/
2021-07-28 23:45:57 +00:00
/**
2021-08-05 12:18:30 +00:00
* Structure used by FindDescendant_cb .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
struct MatchingDescendantData
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
Node * found_node ;
Bool * passed_marker_node ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/* input: */
TidyTagId matching_tagId ;
Node * node_to_find ;
Node * marker_node ;
} ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
2021-08-05 12:18:30 +00:00
* The main engine for FindMatchingDescendant .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
static NodeTraversalSignal FindDescendant_cb ( TidyDocImpl * ARG_UNUSED ( doc ) , Node * node , void * propagate )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
struct MatchingDescendantData * cb_data = ( struct MatchingDescendantData * ) propagate ;
2021-07-28 23:45:57 +00:00
if ( TagId ( node ) = = cb_data - > matching_tagId )
{
/* make sure we match up 'unknown' tags exactly! */
if ( cb_data - > matching_tagId ! = TidyTag_UNKNOWN | |
( node - > element ! = NULL & &
cb_data - > node_to_find ! = NULL & &
cb_data - > node_to_find - > element ! = NULL & &
0 = = TY_ ( tmbstrcmp ) ( cb_data - > node_to_find - > element , node - > element ) ) )
{
cb_data - > found_node = node ;
return ExitTraversal ;
}
}
if ( cb_data - > passed_marker_node & & node = = cb_data - > marker_node )
* cb_data - > passed_marker_node = yes ;
return VisitParent ;
}
/**
 *  Search the parent chain (from `parent` upwards up to the root) for a
 *  node matching the given `node`.
 *
 *  When the search passes the `marker_node` (which is assumed to sit in
 *  the parent chain), this is flagged by setting the boolean referenced by
 *  `is_parent_of_marker` to `yes`; it is reset to `no` before the search.
 *
 *  `is_parent_of_marker` and `marker_node` are optional parameters and may
 *  be NULL.
 *
 *  @param parent               Node at which the traversal starts.
 *  @param node                 Node to match; must not be NULL.
 *  @param marker_node          Optional node whose visit is to be flagged.
 *  @param is_parent_of_marker  Optional out-flag, see above.
 *  @returns The matching node, or NULL if none was found.
 */
static Node *FindMatchingDescendant( Node *parent, Node *node, Node *marker_node, Bool *is_parent_of_marker )
{
    struct MatchingDescendantData cb_data = { 0 };

    /* Validate the input before it is first used. */
    assert( node );

    cb_data.matching_tagId = TagId( node );
    cb_data.node_to_find = node;
    cb_data.marker_node = marker_node;

    /* Hand the caller's flag to the callback; without this the callback
       never has anywhere to record that the marker node was passed. */
    cb_data.passed_marker_node = is_parent_of_marker;

    if ( is_parent_of_marker )
        *is_parent_of_marker = no;

    TY_(TraverseNodeTree)( NULL, parent, FindDescendant_cb, &cb_data );
    return cb_data.found_node;
}
/**
 *  Finds the last list item among the direct children of `list`, providing
 *  it in the out parameter `lastli` (NULL when there is none). Only LI
 *  nodes of type StartTag are considered.
 *
 *  @returns `yes` if a list item was found, otherwise `no`.
 */
static Bool FindLastLI( Node *list, Node **lastli )
{
    Node *child;
    Node *candidate = NULL;

    for ( child = list->content; child != NULL; child = child->next )
    {
        if ( nodeIsLI( child ) && child->type == StartTag )
            candidate = child;
    }

    *lastli = candidate;
    return candidate ? yes : no;
}
/***************************************************************************//*
 ** MARK: - Parser Stack
 ***************************************************************************/
/**
 *  Allocates and initializes the parser's stack with room for
 *  `default_size` entries; the stack starts out empty (top == -1).
 */
void TY_(InitParserStack)( TidyDocImpl* doc )
{
    enum { default_size = 32 };
    const size_t bytes = sizeof(TidyParserMemory) * default_size;

    doc->stack.content = (TidyParserMemory *) TidyAlloc( doc->allocator, bytes );
    doc->stack.size = default_size;
    doc->stack.top = -1;
}
/**
 *  Frees the parser's stack when done, and resets the bookkeeping so that
 *  the stack reads as empty with zero capacity.
 */
void TY_(FreeParserStack)( TidyDocImpl* doc )
{
    TidyFree( doc->allocator, doc->stack.content );

    doc->stack.top = -1;
    doc->stack.size = 0;
    doc->stack.content = NULL;
}
/**
 *  Doubles the capacity of the parser stack: a new buffer is allocated,
 *  the entries in use (indices 0..top) are copied over, and the old
 *  buffer is freed.
 */
static void growParserStack( TidyDocImpl* doc )
{
    const size_t newBytes  = sizeof(TidyParserMemory) * doc->stack.size * 2;
    const size_t usedBytes = sizeof(TidyParserMemory) * (doc->stack.top + 1);
    TidyParserMemory *bigger = (TidyParserMemory *) TidyAlloc( doc->allocator, newBytes );

    memcpy( bigger, doc->stack.content, usedBytes );
    TidyFree( doc->allocator, doc->stack.content );

    doc->stack.content = bigger;
    doc->stack.size *= 2;
}
/**
* Indicates whether or not the stack is empty .
*/
2021-08-05 12:18:30 +00:00
static inline Bool isEmptyParserStack ( TidyDocImpl * doc )
2021-07-28 23:45:57 +00:00
{
return doc - > stack . top < 0 ;
}
/**
* Peek at the parser memory .
*/
2021-08-05 12:18:30 +00:00
static inline FUNC_UNUSED TidyParserMemory peekMemory ( TidyDocImpl * doc )
2021-07-28 23:45:57 +00:00
{
return doc - > stack . content [ doc - > stack . top ] ;
}
/**
2021-08-05 12:18:30 +00:00
* Peek at the parser memory " identity " field . This is just a convenience
2021-07-28 23:45:57 +00:00
* to avoid having to create a new struct instance in the caller .
*/
2021-08-05 12:18:30 +00:00
static inline Parser * peekMemoryIdentity ( TidyDocImpl * doc )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
return doc - > stack . content [ doc - > stack . top ] . identity ;
2021-07-28 23:45:57 +00:00
}
/**
2021-08-05 12:18:30 +00:00
* Peek at the parser memory " mode " field . This is just a convenience
2021-07-28 23:45:57 +00:00
* to avoid having to create a new struct instance in the caller .
*/
2021-08-05 12:18:30 +00:00
static GetTokenMode inline peekMemoryMode ( TidyDocImpl * doc )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
return doc - > stack . content [ doc - > stack . top ] . mode ;
2021-07-28 23:45:57 +00:00
}
/**
 *  Pops the parser memory on top of the stack and returns it.
 *
 *  @returns The popped entry, or a blank entry (initialised with
 *           `{ NULL }`) if the stack is empty.
 *
 *  The debug log arguments are converted to the exact types the format
 *  specifiers require (assuming SPRTF is printf-style): a non-NULL string
 *  for %s, `void *` for %p, and `unsigned long` for %lu.
 */
static TidyParserMemory popMemory( TidyDocImpl* doc )
{
    TidyParserMemory data = { NULL };

    if ( isEmptyParserStack( doc ) )
        return data;

    data = doc->stack.content[doc->stack.top];

    DEBUG_LOG(SPRTF(" \n <--POP %s pointed to is %p, \t memory is %p (size is %lu), depth is %i \n ",
                    (data.reentry_node && data.reentry_node->element) ? data.reentry_node->element : "(null)",
                    (void *) data.reentry_node,
                    (void *) &doc->stack.content[doc->stack.top],
                    (unsigned long) sizeof(TidyParserMemory),
                    doc->stack.top - 1));

    doc->stack.top = doc->stack.top - 1;
    return data;
}
2021-08-05 12:18:30 +00:00
/**
 *  Pushes the parser memory `data` onto the stack, growing the stack
 *  first when it is full.
 *
 *  The debug log arguments are converted to the exact types the format
 *  specifiers require (assuming SPRTF is printf-style): a non-NULL string
 *  for %s, `void *` for %p, and `unsigned long` for %lu.
 */
static void pushMemory( TidyDocImpl* doc, TidyParserMemory data )
{
    if ( doc->stack.top == doc->stack.size - 1 )
        growParserStack( doc );

    doc->stack.top++;
    doc->stack.content[doc->stack.top] = data;

    DEBUG_LOG(SPRTF(" \n -->PUSH %s pointed to is %p, \t memory is %p (size is %lu), depth is %i \n ",
                    (data.reentry_node && data.reentry_node->element) ? data.reentry_node->element : "(null)",
                    (void *) data.reentry_node,
                    (void *) &doc->stack.content[doc->stack.top],
                    (unsigned long) sizeof(TidyParserMemory),
                    doc->stack.top));
}
2021-07-28 23:45:57 +00:00
/***************************************************************************//*
 ** MARK: - Parser Search and Instantiation
 ***************************************************************************/
/**
 *  Retrieves the correct parser for the given node, accounting for various
 *  conditions, and readies the lexer for parsing that node.
 *
 *  @returns The parser of the node's tag, or NULL when the node is NULL,
 *           has no tag definition, has no parser, or is a StartEndTag.
 *           On success the node is recorded as the lexer's `parent`.
 */
static Parser* GetParserForNode( TidyDocImpl* doc, Node *node )
{
    Lexer* lexer = doc->lexer;
    Parser *parser;

    /* [i_a]2 prevent crash for active content (php, asp) docs */
    if ( node == NULL || node->tag == NULL )
        return NULL;

    parser = node->tag->parser;

    /*
       Fix by GLP 2000-12-21.  Need to reset insertspace if this is both
       a non-inline and empty tag (base, link, meta, isindex, hr, area).
    */
    if ( node->tag->model & CM_EMPTY )
        lexer->waswhite = no;
    else if ( !( node->tag->model & CM_INLINE ) )
        lexer->insertspace = no;

    if ( parser == NULL || node->type == StartEndTag )
        return NULL;

    /* [i_a]2 added this - not sure why - CHECKME: */
    lexer->parent = node;

    return parser;
}
/**
 *  This parser controller initiates the parsing process with the document's
 *  root starting with the provided node, which should be the HTML node after
 *  the pre-HTML stuff is handled at a higher level.
 *
 *  This controller is responsible for calling each of the individual parsers,
 *  based on the tokens it pulls from the lexer, or the tokens passed back via
 *  the parserMemory stack from each of the parsers. Having a main, central
 *  looping dispatcher in this fashion allows the prevention of recursion.
 */
/*
 *  doc:  document being parsed.
 *  node: first node to dispatch.
 *
 *  Runs until a parser returns nothing, the parser memory stack offers no
 *  parser to resume, and the lexer yields no further token.
 */
void ParseHTMLWithNode( TidyDocImpl* doc, Node* node )
{
    GetTokenMode mode = IgnoreWhitespace;
    Parser* parser = GetParserForNode( doc, node );

    /*
       This main loop is only extinguished when all of the parser tokens are
       consumed. Ideally, EVERY parser will return nodes to this loop for
       dispatch to the appropriate parser, but some of the recursive parsers
       still consume some tokens on their own.
    */
    for (;;)
    {
        node = (parser != NULL) ? parser( doc, node, mode ) : NULL;

        /*
           A parser handed a node back: anything it deferred is already on
           the stack, so dispatch the new node straight away.
        */
        if ( node != NULL )
        {
            parser = GetParserForNode( doc, node );
            continue;
        }

        /*
           No node came back, so this leaf has bottomed out. Resume whichever
           parser recorded itself on top of the stack, if there is one.
        */
        if ( !isEmptyParserStack( doc ) )
        {
            parser = peekMemoryIdentity( doc );
            if ( parser != NULL )
                continue;

            /* No parser means we're only passing back a parsing mode. */
            mode = peekMemoryMode( doc );
            popMemory( doc );
        }

        /*
           Nothing was returned from a parser and nothing on the stack names
           a parser to resume, so draw a new node from the lexer.
        */
        node = TY_(GetToken)( doc, mode );
        DEBUG_LOG(SPRTF(" ---ParseHTMLWithNode got token %s with mode %u. \n ", node ? node->element : NULL, mode));

        if ( node == NULL )
            break;

        parser = GetParserForNode( doc, node );
    }
}
/***************************************************************************/ /*
2021-08-05 12:18:30 +00:00
* * MARK : - Parsers
2021-07-28 23:45:57 +00:00
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseBlock)
 *  `element` is a node created by the lexer upon seeing the start tag, or
 *  by the parser when the start tag is inferred.
 *
 *  This is a non-recursing parser. It uses the document's parser memory stack
 *  to send subsequent nodes back to the controller for dispatching to parsers.
 *  This parser is also re-enterable, so that post-processing can occur after
 *  such dispatching.
 */
2021-08-05 12:18:30 +00:00
/*
 *  doc:     document being parsed.
 *  element: the block element whose content is to be parsed, or NULL to
 *           resume a previous invocation from the parser memory stack.
 *  mode:    token mode requested by the caller; replaced by the saved mode
 *           when re-entering.
 *
 *  Returns a child node that the caller must dispatch to its own parser
 *  (a re-entry record has been pushed first), or NULL once parsing of
 *  `element` is finished.
 *
 *  NOTE(review): `istackbase` and `checkstack` are not stored in the
 *  re-entry record, so after a re-entry they restart at 0 / yes. For
 *  CM_OBJECT elements this means `lexer->istackbase` is later restored to 0
 *  rather than the value saved on first entry - confirm this is intended.
 */
Node* TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
{
#if defined(ENABLE_DEBUG_LOG)
    static int in_parse_block = 0;
    static int parse_block_cnt = 0;
#endif
    Lexer* lexer = doc->lexer;
    Node *node;
    Bool checkstack = yes;
    uint istackbase = 0;

    if ( element == NULL )
    {
        /* Re-entering: recover the element and mode saved before leaving. */
        TidyParserMemory memory = popMemory( doc );
        node = memory.reentry_node; /* Throwaway, because the loop overwrites this immediately. */
        mode = memory.reentry_mode;
        element = memory.original_node;
        DEBUG_LOG(SPRTF(" >>>Re-Enter ParseBlock with %s \n ", node->element));
    }
    else
    {
        DEBUG_LOG(SPRTF(" >>>Entering ParseBlock %d... %d %s \n ", ++in_parse_block, ++parse_block_cnt,
                        ((element && element->element) ? element->element : " ")));

        if ( element->tag->model & CM_EMPTY )
        {
            DEBUG_LOG(SPRTF(" <<<Exit ParseBlock 1 %d... \n ", --in_parse_block));
            return NULL;
        }

        if ( nodeIsFORM(element) &&
             DescendantOf(element, TidyTag_FORM) )
            TY_(Report)( doc, element, NULL, ILLEGAL_NESTING );

        /*
           InlineDup() asks the lexer to insert inline emphasis tags
           currently pushed on the istack, but take care to avoid
           propagating inline emphasis inside OBJECT or APPLET.
           For these elements a fresh inline stack context is created
           and disposed of upon reaching the end of the element.
           They thus behave like table cells in this respect.
        */
        if ( element->tag->model & CM_OBJECT )
        {
            istackbase = lexer->istackbase;
            lexer->istackbase = lexer->istacksize;
        }

        if ( !(element->tag->model & CM_MIXED) )
            TY_(InlineDup)( doc, NULL );

        /*\
         *  Issue #212 - If it is likely that it may be necessary
         *  to move a leading space into a text node before this
         *  element, then keep the mode MixedContent to keep any
         *  leading space
        \*/
        if ( !(element->tag->model & CM_INLINE) ||
              (element->tag->model & CM_FIELD) )
        {
            mode = IgnoreWhitespace;
        }
        else if ( mode == IgnoreWhitespace )
        {
            /* Issue #212 - Further fix in case ParseBlock() is called with 'IgnoreWhitespace'
               when such a leading space may need to be inserted before this element to
               preserve the browser view */
            mode = MixedContent;
        }
    } /* Re-Entering */

    /*
       Main Loop
     */
    while ( (node = TY_(GetToken)( doc, mode /*MixedContent*/ )) != NULL )
    {
        /* Log the mode actually used to fetch this token. */
        DEBUG_LOG(SPRTF(" ---ParseBlock got token %s with mode %u \n ", node->element, (unsigned int)mode));

        /* end tag for this element */
        if ( node->type == EndTag && node->tag &&
             (node->tag == element->tag || element->was == node->tag) )
        {
            TY_(FreeNode)( doc, node );

            if ( element->tag->model & CM_OBJECT )
            {
                /* pop inline stack */
                while ( lexer->istacksize > lexer->istackbase )
                    TY_(PopInline)( doc, NULL );
                lexer->istackbase = istackbase;
            }

            element->closed = yes;
            TrimSpaces( doc, element );
            DEBUG_LOG(SPRTF(" <<<Exit ParseBlock 2 %d... \n ", --in_parse_block));
            return NULL;
        }

        if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) )
        {
            if ( TY_(nodeIsElement)(node) )
                TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, node );
            continue;
        }

        if ( node->type == EndTag )
        {
            if ( node->tag == NULL )
            {
                TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
                TY_(FreeNode)( doc, node );
                continue;
            }
            else if ( nodeIsBR(node) )
                node->type = StartTag;
            else if ( nodeIsP(node) )
            {
                /* Cannot have a block inside a paragraph, so no checking
                   for an ancestor is necessary -- but we _can_ have
                   paragraphs inside a block, so change it to an implicit
                   empty paragraph, to be dealt with according to the user's
                   options
                */
                node->type = StartEndTag;
                node->implicit = yes;
            }
            else if ( DescendantOf(element, node->tag->id) )
            {
                /*
                  if this is the end tag for an ancestor element
                  then infer end tag for this element
                */
                TY_(UngetToken)( doc );
                break;
            }
            else
            {
                /* special case </tr> etc. for stuff moved in front of table */
                if ( lexer->exiled
                     && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
                {
                    TY_(UngetToken)( doc );
                    TrimSpaces( doc, element );
                    DEBUG_LOG(SPRTF(" <<<Exit ParseBlock 2 %d... \n ", --in_parse_block));
                    return NULL;
                }
            }
        }

        /* mixed content model permits text */
        if ( TY_(nodeIsText)(node) )
        {
            if ( checkstack )
            {
                checkstack = no;
                if ( !(element->tag->model & CM_MIXED) )
                {
                    if ( TY_(InlineDup)(doc, node) > 0 )
                        continue;
                }
            }

            TY_(InsertNodeAtEnd)( element, node );
            mode = MixedContent;

            /*
              HTML4 strict doesn't allow mixed content for
              elements with %block; as their content model
            */
            /*
              But only body, map, blockquote, form and
              noscript have content model %block;
            */
            if ( nodeIsBODY(element)       ||
                 nodeIsMAP(element)        ||
                 nodeIsBLOCKQUOTE(element) ||
                 nodeIsFORM(element)       ||
                 nodeIsNOSCRIPT(element) )
                TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
            continue;
        }

        if ( InsertMisc(element, node) )
            continue;

        /* allow PARAM elements? */
        if ( nodeIsPARAM(node) )
        {
            if ( TY_(nodeHasCM)(element, CM_PARAM) && TY_(nodeIsElement)(node) )
            {
                TY_(InsertNodeAtEnd)( element, node );
                continue;
            }

            /* otherwise discard it */
            TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, node );
            continue;
        }

        /* allow AREA elements? */
        if ( nodeIsAREA(node) )
        {
            if ( nodeIsMAP(element) && TY_(nodeIsElement)(node) )
            {
                TY_(InsertNodeAtEnd)( element, node );
                continue;
            }

            /* otherwise discard it */
            TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, node );
            continue;
        }

        /* ignore unknown start/end tags */
        if ( node->tag == NULL )
        {
            TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, node );
            continue;
        }

        /*
          Allow CM_INLINE elements here.

          Allow CM_BLOCK elements here unless
          lexer->excludeBlocks is yes.

          LI and DD are special cased.

          Otherwise infer end tag for this element.
        */
        if ( !TY_(nodeHasCM)(node, CM_INLINE) )
        {
            if ( !TY_(nodeIsElement)(node) )
            {
                if ( nodeIsFORM(node) )
                    BadForm( doc );

                TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
                TY_(FreeNode)( doc, node );
                continue;
            }

            /* #427671 - Fix by Randy Waki - 10 Aug 00 */
            /*
             If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION
             start tag, discard the start tag and let the subsequent content get
             parsed as content of the enclosing LI.  This seems to mimic IE and
             Netscape, and avoids an infinite loop: without this check,
             ParseBlock (which is parsing the LI's content) and ParseList (which
             is parsing the LI's parent's content) repeatedly defer to each
             other to parse the illegal start tag, each time inferring a missing
             </li> or <li> respectively.

             NOTE: This check is a bit fragile.  It specifically checks for the
             four tags that happen to weave their way through the current series
             of tests performed by ParseBlock and ParseList to trigger the
             infinite loop.
            */
            if ( nodeIsLI(element) )
            {
                if ( nodeIsFRAME(node)    ||
                     nodeIsFRAMESET(node) ||
                     nodeIsOPTGROUP(node) ||
                     nodeIsOPTION(node) )
                {
                    TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
                    TY_(FreeNode)( doc, node );  /* DSR - 27Apr02 avoid memory leak */
                    continue;
                }
            }

            if ( nodeIsTD(element) || nodeIsTH(element) )
            {
                /* if parent is a table cell, avoid inferring the end of the cell */

                if ( TY_(nodeHasCM)(node, CM_HEAD) )
                {
                    MoveToHead( doc, element, node );
                    continue;
                }

                if ( TY_(nodeHasCM)(node, CM_LIST) )
                {
                    TY_(UngetToken)( doc );
                    node = TY_(InferredTag)( doc, TidyTag_UL );
                    AddClassNoIndent( doc, node );
                    lexer->excludeBlocks = yes;
                }
                else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
                {
                    TY_(UngetToken)( doc );
                    node = TY_(InferredTag)( doc, TidyTag_DL );
                    lexer->excludeBlocks = yes;
                }

                /* infer end of current table cell */
                if ( !TY_(nodeHasCM)(node, CM_BLOCK) )
                {
                    TY_(UngetToken)( doc );
                    TrimSpaces( doc, element );
                    DEBUG_LOG(SPRTF(" <<<Exit ParseBlock 3 %d... \n ", --in_parse_block));
                    return NULL;
                }
            }
            else if ( TY_(nodeHasCM)(node, CM_BLOCK) )
            {
                if ( lexer->excludeBlocks )
                {
                    if ( !TY_(nodeHasCM)(element, CM_OPT) )
                        TY_(Report)( doc, element, node, MISSING_ENDTAG_BEFORE );

                    TY_(UngetToken)( doc );

                    if ( TY_(nodeHasCM)(element, CM_OBJECT) )
                        lexer->istackbase = istackbase;

                    TrimSpaces( doc, element );
                    DEBUG_LOG(SPRTF(" <<<Exit ParseBlock 4 %d... \n ", --in_parse_block));
                    return NULL;
                }
            }
            else /* things like list items */
            {
                if ( node->tag->model & CM_HEAD )
                {
                    MoveToHead( doc, element, node );
                    continue;
                }

                /*
                 special case where a form start tag
                 occurs in a tr and is followed by td or th
                */
                if ( nodeIsFORM(element) &&
                     nodeIsTD(element->parent) &&
                     element->parent->implicit )
                {
                    if ( nodeIsTD(node) )
                    {
                        TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
                        TY_(FreeNode)( doc, node );
                        continue;
                    }

                    if ( nodeIsTH(node) )
                    {
                        TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
                        TY_(FreeNode)( doc, node );

                        /* Turn the implicit parent <td> into a <th>; the
                           element name must be exactly "th" so it agrees
                           with the TH tag definition assigned below. */
                        node = element->parent;
                        TidyDocFree( doc, node->element );
                        node->element = TY_(tmbstrdup)( doc->allocator, "th" );
                        node->tag = TY_(LookupTagDef)( TidyTag_TH );
                        continue;
                    }
                }

                if ( !TY_(nodeHasCM)(element, CM_OPT) && !element->implicit )
                    TY_(Report)( doc, element, node, MISSING_ENDTAG_BEFORE );

                /* #521, warn on missing optional end-tags if not omitting them. */
                if ( cfgBool(doc, TidyOmitOptionalTags) == no && TY_(nodeHasCM)(element, CM_OPT) )
                    TY_(Report)( doc, element, node, MISSING_ENDTAG_OPTIONAL );

                TY_(UngetToken)( doc );

                if ( TY_(nodeHasCM)(node, CM_LIST) )
                {
                    if ( element->parent && element->parent->tag &&
                         element->parent->tag->parser == TY_(ParseList) )
                    {
                        TrimSpaces( doc, element );
                        DEBUG_LOG(SPRTF(" <<<Exit ParseBlock 5 %d... \n ", --in_parse_block));
                        return NULL;
                    }

                    node = TY_(InferredTag)( doc, TidyTag_UL );
                    AddClassNoIndent( doc, node );
                }
                else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
                {
                    if ( nodeIsDL(element->parent) )
                    {
                        TrimSpaces( doc, element );
                        DEBUG_LOG(SPRTF(" <<<Exit ParseBlock 6 %d... \n ", --in_parse_block));
                        return NULL;
                    }

                    node = TY_(InferredTag)( doc, TidyTag_DL );
                }
                else if ( TY_(nodeHasCM)(node, CM_TABLE) || TY_(nodeHasCM)(node, CM_ROW) )
                {
                    /* http://tidy.sf.net/issue/1316307 */
                    /* In exiled mode, return so table processing can
                       continue. */
                    if ( lexer->exiled )
                    {
                        DEBUG_LOG(SPRTF(" <<<Exit ParseBlock 7 %d... \n ", --in_parse_block));
                        return NULL;
                    }
                    node = TY_(InferredTag)( doc, TidyTag_TABLE );
                }
                else if ( TY_(nodeHasCM)(element, CM_OBJECT) )
                {
                    /* pop inline stack */
                    while ( lexer->istacksize > lexer->istackbase )
                        TY_(PopInline)( doc, NULL );
                    lexer->istackbase = istackbase;
                    TrimSpaces( doc, element );
                    DEBUG_LOG(SPRTF(" <<<Exit ParseBlock 8 %d... \n ", --in_parse_block));
                    return NULL;
                }
                else
                {
                    TrimSpaces( doc, element );
                    DEBUG_LOG(SPRTF(" <<<Exit ParseBlock 9 %d... \n ", --in_parse_block));
                    return NULL;
                }
            }
        }

        /*\
         *  Issue #307 - an <A> tag ends any open <A> element
         *  Like #427827 - fixed by Randy Waki and Bjoern Hoehrmann 23 Aug 00
         *  in ParseInline(), fix copied HERE to ParseBlock()
         *  href: http://www.w3.org/TR/html-markup/a.html
         *  The interactive element a must not appear as a descendant of the a element.
        \*/
        if ( nodeIsA(node) && !node->implicit &&
             (nodeIsA(element) || DescendantOf(element, TidyTag_A)) )
        {
            if ( node->type != EndTag && node->attributes == NULL
                 && cfgBool(doc, TidyCoerceEndTags) )
            {
                node->type = EndTag;
                TY_(Report)( doc, element, node, COERCE_TO_ENDTAG );
                TY_(UngetToken)( doc );
                continue;
            }

            if ( nodeIsA(element) )
            {
                TY_(Report)( doc, element, node, MISSING_ENDTAG_BEFORE );
                TY_(UngetToken)( doc );
            }
            else
            {
                /* Issue #597 - if we not 'UngetToken' then it is being discarded.
                   Add message, and 'FreeNode' - thanks @ralfjunker */
                TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
                TY_(FreeNode)( doc, node );
            }

            if ( !(mode & Preformatted) )
                TrimSpaces( doc, element );

            DEBUG_LOG(SPRTF(" <<<Exit ParseBlock 9b %d... \n ", --in_parse_block));
            return NULL;
        }

        /* parse known element */
        if ( TY_(nodeIsElement)(node) )
        {
            if ( node->tag->model & CM_INLINE )
            {
                if ( checkstack && !node->implicit )
                {
                    checkstack = no;

                    if ( !(element->tag->model & CM_MIXED) ) /* #431731 - fix by Randy Waki 25 Dec 00 */
                    {
                        if ( TY_(InlineDup)(doc, node) > 0 )
                            continue;
                    }
                }

                mode = MixedContent;
            }
            else
            {
                checkstack = yes;
                mode = IgnoreWhitespace;
            }

            /* trim white space before <br> */
            if ( nodeIsBR(node) )
                TrimSpaces( doc, element );

            TY_(InsertNodeAtEnd)( element, node );

            if ( node->implicit )
                TY_(Report)( doc, element, node, INSERTING_TAG );

            /* Issue #212 - the child is no longer parsed here with a
               hard-coded 'IgnoreWhitespace'. It is handed back to the caller
               instead, and the mode computed above is saved so that this
               loop resumes with it on re-entry. */
            {
                TidyParserMemory memory = {0};
                memory.identity = TY_(ParseBlock);
                memory.reentry_node = node;
                memory.reentry_mode = mode;
                memory.original_node = element;
                pushMemory( doc, memory );
                DEBUG_LOG(SPRTF(" <<<Leave ParseBlock to return node %s \n ", node->element));
            }
            return node;
        }

        /* discard unexpected tags */
        if ( node->type == EndTag )
            TY_(PopInline)( doc, node );  /* if inline end tag */

        TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
        TY_(FreeNode)( doc, node );
        continue;
    }

    /*
       Reached when the lexer ran dry (node is NULL) or an ancestor's end
       tag was pushed back (node still refers to that token).
    */
    if ( !(element->tag->model & CM_OPT) )
        TY_(Report)( doc, element, node, MISSING_ENDTAG_FOR );

    if ( element->tag->model & CM_OBJECT )
    {
        /* pop inline stack */
        while ( lexer->istacksize > lexer->istackbase )
            TY_(PopInline)( doc, NULL );
        lexer->istackbase = istackbase;
    }

    TrimSpaces( doc, element );

    DEBUG_LOG(SPRTF(" <<<Exit ParseBlock 10 %d... \n ", --in_parse_block));
    return NULL;
}
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseBody)
 *  Parses the `body` tag.
 *
 *  This is a non-recursing parser. It uses the document's parser memory stack
 *  to send subsequent nodes back to the controller for dispatching to parsers.
 *  This parser is also re-enterable, so that post-processing can occur after
 *  such dispatching.
 */
2021-08-05 12:18:30 +00:00
/*
 *  doc:  document being parsed.
 *  body: the body element to fill, or NULL to resume a previous invocation
 *        from the parser memory stack.
 *  mode: ignored on a fresh entry (parsing always starts with
 *        IgnoreWhitespace); on re-entry the saved mode is used instead.
 *
 *  Returns a child node that the caller must dispatch to its own parser
 *  (a re-entry record holding body, checkstack, iswhitenode and mode has
 *  been pushed first), or NULL once parsing of the body is finished.
 */
Node* TY_(ParseBody)( TidyDocImpl* doc, Node *body, GetTokenMode mode )
{
    Lexer* lexer = doc->lexer;
    Node *node;
    Bool checkstack = yes;
    Bool iswhitenode = no;

    mode = IgnoreWhitespace;

    /*
     If we're re-entering, then we need to setup from a previous state,
     instead of starting fresh. We can pull what we need from the document's
     stack.
     */
    if ( body == NULL )
    {
        TidyParserMemory memory = popMemory( doc );
        node = memory.reentry_node; /* Throwaway, as main loop overwrites anyway. */
        body = memory.original_node;
        checkstack = memory.register_b_1;
        iswhitenode = memory.register_b_2;
        mode = memory.mode;
        DEBUG_LOG(SPRTF(" >>>Re-Enter ParseBody with %s \n ", node->element));
    }
    else
    {
        DEBUG_LOG(SPRTF(" >>>Enter ParseBody... \n "));
        TY_(BumpObject)( doc, body->parent );
    }

    while ( (node = TY_(GetToken)( doc, mode )) != NULL )
    {
        /* Log the mode actually used to fetch this token. */
        DEBUG_LOG(SPRTF(" ---ParseBody got token %s with mode %u \n ", node->element, (unsigned int)mode));

        /* find and discard multiple <body> elements */
        if ( node->tag == body->tag && node->type == StartTag )
        {
            TY_(Report)( doc, body, node, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, node );
            continue;
        }

        /* #538536 Extra endtags not detected */
        if ( nodeIsHTML(node) )
        {
            if ( TY_(nodeIsElement)(node) || lexer->seenEndHtml )
                TY_(Report)( doc, body, node, DISCARDING_UNEXPECTED );
            else
                lexer->seenEndHtml = 1;

            TY_(FreeNode)( doc, node );
            continue;
        }

        if ( lexer->seenEndBody &&
             ( node->type == StartTag ||
               node->type == EndTag   ||
               node->type == StartEndTag ) )
        {
            TY_(Report)( doc, body, node, CONTENT_AFTER_BODY );
        }

        if ( node->tag == body->tag && node->type == EndTag )
        {
            body->closed = yes;
            TrimSpaces( doc, body );
            TY_(FreeNode)( doc, node );
            lexer->seenEndBody = 1;
            mode = IgnoreWhitespace;

            if ( nodeIsNOFRAMES(body->parent) )
                break;

            continue;
        }

        if ( nodeIsNOFRAMES(node) )
        {
            if ( node->type == StartTag )
            {
                /* Hand the <noframes> back to the caller for parsing and
                   remember where to resume. */
                TidyParserMemory memory = {0};

                TY_(InsertNodeAtEnd)( body, node );

                memory.identity = TY_(ParseBody);
                memory.original_node = body;
                memory.reentry_node = node;
                memory.register_b_1 = checkstack;
                memory.register_b_2 = iswhitenode;
                memory.mode = mode;
                pushMemory( doc, memory );
                return node;
            }

            if ( node->type == EndTag && nodeIsNOFRAMES(body->parent) )
            {
                TrimSpaces( doc, body );
                TY_(UngetToken)( doc );
                break;
            }
        }

        if ( (nodeIsFRAME(node) || nodeIsFRAMESET(node))
             && nodeIsNOFRAMES(body->parent) )
        {
            TrimSpaces( doc, body );
            TY_(UngetToken)( doc );
            break;
        }

        iswhitenode = no;

        if ( TY_(nodeIsText)(node) &&
             node->end <= node->start + 1 &&
             lexer->lexbuf[node->start] == ' ' )
            iswhitenode = yes;

        /* deal with comments etc. */
        if ( InsertMisc(body, node) )
            continue;

        /* mixed content model permits text */
        if ( TY_(nodeIsText)(node) )
        {
            if ( iswhitenode && mode == IgnoreWhitespace )
            {
                TY_(FreeNode)( doc, node );
                continue;
            }

            /* HTML 2 and HTML4 strict don't allow text here */
            TY_(ConstrainVersion)( doc, ~(VERS_HTML40_STRICT | VERS_HTML20) );

            if ( checkstack )
            {
                checkstack = no;

                if ( TY_(InlineDup)(doc, node) > 0 )
                    continue;
            }

            TY_(InsertNodeAtEnd)( body, node );
            mode = MixedContent;
            continue;
        }

        if ( node->type == DocTypeTag )
        {
            InsertDocType( doc, body, node );
            continue;
        }

        /* discard unknown and PARAM tags */
        if ( node->tag == NULL || nodeIsPARAM(node) )
        {
            TY_(Report)( doc, body, node, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, node );
            continue;
        }

        /*
          Netscape allows LI and DD directly in BODY
          We infer UL or DL respectively and use this
          Bool to exclude block-level elements so as
          to match Netscape's observed behaviour.
        */
        lexer->excludeBlocks = no;

        if ( ( nodeIsINPUT(node) ||
               ( !TY_(nodeHasCM)(node, CM_BLOCK) && !TY_(nodeHasCM)(node, CM_INLINE) )
             ) && !TY_(IsHTML5Mode)(doc) )
        {
            /* avoid this error message being issued twice */
            if ( !(node->tag->model & CM_HEAD) )
                TY_(Report)( doc, body, node, TAG_NOT_ALLOWED_IN );

            if ( node->tag->model & CM_HTML )
            {
                /* copy body attributes if current body was inferred */
                if ( nodeIsBODY(node) && body->implicit
                     && body->attributes == NULL )
                {
                    body->attributes = node->attributes;
                    node->attributes = NULL;
                }

                TY_(FreeNode)( doc, node );
                continue;
            }

            if ( node->tag->model & CM_HEAD )
            {
                MoveToHead( doc, body, node );
                continue;
            }

            if ( node->tag->model & CM_LIST )
            {
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)( doc, TidyTag_UL );
                AddClassNoIndent( doc, node );
                lexer->excludeBlocks = yes;
            }
            else if ( node->tag->model & CM_DEFLIST )
            {
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)( doc, TidyTag_DL );
                lexer->excludeBlocks = yes;
            }
            else if ( node->tag->model & (CM_TABLE | CM_ROWGRP | CM_ROW) )
            {
                /* http://tidy.sf.net/issue/2855621 */
                if ( node->type != EndTag )
                {
                    TY_(UngetToken)( doc );
                    node = TY_(InferredTag)( doc, TidyTag_TABLE );
                }
                lexer->excludeBlocks = yes;
            }
            else if ( nodeIsINPUT(node) )
            {
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)( doc, TidyTag_FORM );
                lexer->excludeBlocks = yes;
            }
            else
            {
                if ( !TY_(nodeHasCM)(node, CM_ROW | CM_FIELD) )
                {
                    TY_(UngetToken)( doc );
                    return NULL;
                }

                /* ignore </td> </th> <option> etc. */
                TY_(FreeNode)( doc, node );
                continue;
            }
        }

        if ( node->type == EndTag )
        {
            if ( nodeIsBR(node) )
                node->type = StartTag;
            else if ( nodeIsP(node) )
            {
                node->type = StartEndTag;
                node->implicit = yes;
            }
            else if ( TY_(nodeHasCM)(node, CM_INLINE) )
                TY_(PopInline)( doc, node );
        }

        if ( TY_(nodeIsElement)(node) )
        {
            if ( nodeIsMAIN(node) )
            {
                /*\ Issue #166 - repeated <main> element
                 *  How to efficiently search for a previous main element?
                \*/
                if ( findNodeById(doc, TidyTag_MAIN) )
                {
                    doc->badForm |= flg_BadMain; /* this is an ERROR in format */
                    TY_(Report)( doc, body, node, DISCARDING_UNEXPECTED );
                    TY_(FreeNode)( doc, node );
                    continue;
                }
            }

            /* Issue #20 - merging from Ger Hobbelt fork put back CM_MIXED, which had been
               removed to fix this issue - reverting to fix 880221e
             */
            if ( TY_(nodeHasCM)(node, CM_INLINE) )
            {
                /* HTML4 strict doesn't allow inline content here */
                /* but HTML2 does allow img elements as children of body */
                if ( nodeIsIMG(node) )
                    TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
                else
                    TY_(ConstrainVersion)( doc, ~(VERS_HTML40_STRICT | VERS_HTML20) );

                if ( checkstack && !node->implicit )
                {
                    checkstack = no;

                    if ( TY_(InlineDup)(doc, node) > 0 )
                        continue;
                }

                mode = MixedContent;
            }
            else
            {
                checkstack = yes;
                mode = IgnoreWhitespace;
            }

            if ( node->implicit )
                TY_(Report)( doc, body, node, INSERTING_TAG );

            TY_(InsertNodeAtEnd)( body, node );

            /* Save the loop state, then let the caller parse the child. */
            {
                TidyParserMemory memory = {0};
                memory.identity = TY_(ParseBody);
                memory.original_node = body;
                memory.reentry_node = node;
                memory.register_b_1 = checkstack;
                memory.register_b_2 = iswhitenode;
                memory.mode = mode;
                pushMemory( doc, memory );
            }
            DEBUG_LOG(SPRTF(" <<<Exiting ParseBody with a node to parse: %s \n ", node->element));
            return node;
        }

        /* discard unexpected tags */
        TY_(Report)( doc, body, node, DISCARDING_UNEXPECTED );
        TY_(FreeNode)( doc, node );
    }

    DEBUG_LOG(SPRTF(" <<<Exit ParseBody at bottom \n "));
    return NULL;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseColGroup)
 *  Parses the `colgroup` tag.
 *
 *  This is a non-recursing parser. It uses the document's parser memory stack
 *  to send subsequent nodes back to the controller for dispatching to parsers.
 *  This parser is also re-enterable, so that post-processing can occur after
 *  such dispatching.
 */
/*
 *  doc:      document being parsed.
 *  colgroup: the colgroup element to fill, or NULL to resume a previous
 *            invocation from the parser memory stack.
 *  mode:     not used to fetch tokens (the loop always reads with
 *            IgnoreWhitespace); it is only carried through the re-entry
 *            record, which is why it is not marked ARG_UNUSED.
 *
 *  Returns a <col> node that the caller must dispatch (a re-entry record
 *  has been pushed first), or NULL once the colgroup is finished.
 */
Node* TY_(ParseColGroup)( TidyDocImpl* doc, Node *colgroup, GetTokenMode mode )
{
    Node *node, *parent;

    /*
     If we're re-entering, then we need to setup from a previous state,
     instead of starting fresh. We can pull what we need from the document's
     stack.
     */
    if ( colgroup == NULL )
    {
        TidyParserMemory memory = popMemory( doc );
        node = memory.reentry_node; /* Throwaway, as main loop overwrites anyway. */
        colgroup = memory.original_node;
        mode = memory.mode;
        DEBUG_LOG(SPRTF(" >>>Re-Enter ParseColGroup with %s \n ", node->element));
    }
    else
    {
        DEBUG_LOG(SPRTF(" >>>Enter ParseColGroup... \n "));
        if ( colgroup->tag->model & CM_EMPTY )
            return NULL;
    }

    while ( (node = TY_(GetToken)( doc, IgnoreWhitespace )) != NULL )
    {
        DEBUG_LOG(SPRTF(" ---ParseColGroup got token %s with mode %u \n ", node->element, IgnoreWhitespace));

        /* end tag for this colgroup */
        if ( node->tag == colgroup->tag && node->type == EndTag )
        {
            TY_(FreeNode)( doc, node );
            colgroup->closed = yes;
            return NULL;
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if ( node->type == EndTag )
        {
            if ( nodeIsFORM(node) )
            {
                BadForm( doc );
                TY_(Report)( doc, colgroup, node, DISCARDING_UNEXPECTED );
                TY_(FreeNode)( doc, node );
                continue;
            }

            for ( parent = colgroup->parent;
                  parent != NULL;
                  parent = parent->parent )
            {
                if ( node->tag == parent->tag )
                {
                    TY_(UngetToken)( doc );
                    return NULL;
                }
            }
        }

        /* Text ends the colgroup; push it back for the caller. */
        if ( TY_(nodeIsText)(node) )
        {
            TY_(UngetToken)( doc );
            return NULL;
        }

        /* deal with comments etc. */
        if ( InsertMisc(colgroup, node) )
            continue;

        /* discard unknown tags */
        if ( node->tag == NULL )
        {
            TY_(Report)( doc, colgroup, node, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, node );
            continue;
        }

        /* Anything other than <col> ends the colgroup. */
        if ( !nodeIsCOL(node) )
        {
            TY_(UngetToken)( doc );
            return NULL;
        }

        /* A stray </col> is discarded. */
        if ( node->type == EndTag )
        {
            TY_(Report)( doc, colgroup, node, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, node );
            continue;
        }

        /* node should be <COL> */
        TY_(InsertNodeAtEnd)( colgroup, node );
        {
            TidyParserMemory memory = {0};
            memory.identity = TY_(ParseColGroup);
            memory.original_node = colgroup;
            memory.reentry_node = node;
            memory.mode = mode;
            pushMemory( doc, memory );
        }
        DEBUG_LOG(SPRTF(" <<<Exiting ParseColGroup with a node to parse: %s \n ", node->element));
        return node;
    }

    return NULL;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseDatalist)
 *  Parses the children of a `datalist` element.
 *
 *  This parser does not recurse. When it meets a child that needs its own
 *  parser (`option`, `optgroup`, nested `datalist`, `script` start tags) it
 *  records its state on the document's parser memory stack and returns that
 *  child to the controller. The controller later calls back with
 *  `field == NULL`, at which point the saved state is popped and the token
 *  loop resumes.
 *
 *  @param doc    The Tidy document being parsed.
 *  @param field  The `datalist` node, or NULL when re-entering.
 *  @param mode   Unused.
 *  @returns A child node the controller must parse next, or NULL when this
 *           element is finished (end tag seen, or the lexer ran out of tokens).
 */
Node* TY_(ParseDatalist)( TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode) )
{
#if defined(ENABLE_DEBUG_LOG)
    static int in_parse_datalist = 0;
#endif
    Lexer* lexer = doc->lexer;
    Node *node;

    if ( field != NULL )
    {
        DEBUG_LOG(SPRTF("Entering ParseDatalist %d...\n", ++in_parse_datalist));
    }
    else
    {
        /* Re-entry: restore the element we were working on. */
        TidyParserMemory memory = popMemory( doc );
        field = memory.original_node;
        node = memory.reentry_node; /* Only used for logging; the loop overwrites it. */
        DEBUG_LOG(SPRTF(">>>Re-Enter ParseDataList with %s\n", node->element));
    }

    lexer->insert = NULL;  /* defer implicit inline start tags */

    for (;;)
    {
        node = TY_(GetToken)( doc, IgnoreWhitespace );
        if ( node == NULL )
            break;

        /* Our own end tag closes the element. */
        if ( node->tag == field->tag && node->type == EndTag )
        {
            TY_(FreeNode)( doc, node );
            field->closed = yes;
            TrimSpaces( doc, field );

            DEBUG_LOG(SPRTF("Exit ParseDatalist 1 %d...\n", --in_parse_datalist));
            return NULL;
        }

        /* deal with comments etc. */
        if ( InsertMisc(field, node) )
            continue;

        /* Permitted children are handed back to the controller for parsing. */
        if ( node->type == StartTag &&
             ( nodeIsOPTION(node)   ||
               nodeIsOPTGROUP(node) ||
               nodeIsDATALIST(node) ||
               nodeIsSCRIPT(node) ) )
        {
            TidyParserMemory memory = {0};

            TY_(InsertNodeAtEnd)( field, node );

            memory.identity = TY_(ParseDatalist);
            memory.original_node = field;
            memory.reentry_node = node;
            memory.reentry_mode = IgnoreWhitespace;
            pushMemory( doc, memory );
            return node;
        }

        /* discard unexpected tags */
        TY_(Report)( doc, field, node, DISCARDING_UNEXPECTED );
        TY_(FreeNode)( doc, node );
    }

    /* Ran out of tokens without seeing the end tag (node is NULL here). */
    TY_(Report)( doc, field, node, MISSING_ENDTAG_FOR );

    DEBUG_LOG(SPRTF("Exit ParseDatalist 2 %d...\n", --in_parse_datalist));
    return NULL;
}
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseDefList)
 *  Parses the children of a `dl` element.
 *
 *  This parser does not recurse. Children that need parsing (`dt`, `dd`,
 *  `center`) are returned to the controller after this parser's state has
 *  been pushed onto the document's parser memory stack. The controller then
 *  calls back with `list == NULL`, and the saved state selects where to
 *  resume:
 *
 *    - STATE_INITIAL:            keep pulling tokens.
 *    - STATE_POST_NODEISCENTER:  a `center` child has just been parsed; the
 *                                list may need to be split in two.
 *
 *  @param doc   The Tidy document being parsed.
 *  @param list  The `dl` node, or NULL when re-entering.
 *  @param mode  Not used by this parser.
 *  @returns A child node the controller must parse next, or NULL when this
 *           list is finished.
 */
Node* TY_(ParseDefList)( TidyDocImpl* doc, Node *list, GetTokenMode mode )
{
#if defined(ENABLE_DEBUG_LOG)
    static int in_parse_deflist = 0;
#endif
    Lexer* lexer = doc->lexer;
    Node *node = NULL;

    /* The numeric values are stored in parser memory; do not reorder. */
    enum parserState {
        STATE_INITIAL,                /* This is the initial state for every parser. */
        STATE_POST_NODEISCENTER,      /* To-do after re-entering after checks. */
        STATE_COMPLETE,               /* Done with the switch. */
    } state = STATE_INITIAL;

    if ( list == NULL )
    {
        TidyParserMemory memory = popMemory( doc );
        list = memory.original_node;
        node = memory.reentry_node;   /* Needed by STATE_POST_NODEISCENTER. */
        state = memory.reentry_state;
        DEBUG_LOG(SPRTF(">>>Re-Enter ParseDefList with %s\n", node->element));
    }
    else
    {
        DEBUG_LOG(SPRTF(">>>Entering ParseDefList %d...\n", ++in_parse_deflist));
    }

    if ( list->tag->model & CM_EMPTY )
        return NULL;

    lexer->insert = NULL;  /* defer implicit inline start tags */

    while ( state != STATE_COMPLETE )
    {
        /* Only fetch a token when we are not resuming with one in hand. */
        if ( state == STATE_INITIAL )
            node = TY_(GetToken)( doc, IgnoreWhitespace );

        switch ( state )
        {
            case STATE_INITIAL:
            {
                if ( node == NULL )
                {
                    state = STATE_COMPLETE;
                    continue;
                }

                if ( node->tag == list->tag && node->type == EndTag )
                {
                    TY_(FreeNode)( doc, node );
                    list->closed = yes;
                    DEBUG_LOG(SPRTF("<<<Exit ParseDefList 1 %d... CM_EMPTY\n", --in_parse_deflist));
                    return NULL;
                }

                /* deal with comments etc. */
                if ( InsertMisc(list, node) )
                    continue;

                /* Bare text: put it back and wrap it in an inferred <dt>. */
                if ( TY_(nodeIsText)(node) )
                {
                    TY_(UngetToken)( doc );
                    node = TY_(InferredTag)( doc, TidyTag_DT );
                    TY_(Report)( doc, list, node, MISSING_STARTTAG );
                }

                if ( node->tag == NULL )
                {
                    TY_(Report)( doc, list, node, DISCARDING_UNEXPECTED );
                    TY_(FreeNode)( doc, node );
                    continue;
                }

                /*
                  if this is the end tag for an ancestor element
                  then infer end tag for this element
                */
                if ( node->type == EndTag )
                {
                    Node *ancestor;
                    Bool discardIt = no;

                    if ( nodeIsFORM(node) )
                    {
                        BadForm( doc );
                        TY_(Report)( doc, list, node, DISCARDING_UNEXPECTED );
                        TY_(FreeNode)( doc, node );
                        continue;
                    }

                    for ( ancestor = list->parent;
                          ancestor != NULL;
                          ancestor = ancestor->parent )
                    {
                        /* Do not match across BODY to avoid infinite loop
                           between ParseBody and this parser,
                           See http://tidy.sf.net/bug/1098012. */
                        if ( nodeIsBODY(ancestor) )
                        {
                            discardIt = yes;
                            break;
                        }
                        if ( node->tag == ancestor->tag )
                        {
                            TY_(Report)( doc, list, node, MISSING_ENDTAG_BEFORE );
                            TY_(UngetToken)( doc );
                            DEBUG_LOG(SPRTF("<<<Exit ParseDefList 2 %d... CM_EMPTY\n", --in_parse_deflist));
                            return NULL;
                        }
                    }

                    if ( discardIt )
                    {
                        TY_(Report)( doc, list, node, DISCARDING_UNEXPECTED );
                        TY_(FreeNode)( doc, node );
                        continue;
                    }
                }

                /* center in a dt or a dl breaks the dl list in two */
                if ( nodeIsCENTER(node) )
                {
                    if ( list->content )
                        TY_(InsertNodeAfterElement)( list, node );
                    else /* trim empty dl list */
                    {
                        TY_(InsertNodeBeforeElement)( list, node );
                    }

                    /* #426885 - fix by Glenn Carroll 19 Apr 00, and
                                 Gary Dechaines 11 Aug 00 */
                    /* Parsing the center can destroy node, if it is followed
                     * immediately by </center>. Whether that happened is
                     * determined in STATE_POST_NODEISCENTER, once the
                     * controller has parsed the center and re-entered here.
                     */

                    /* and parse contents of center */
                    lexer->excludeBlocks = no;
                    {
                        TidyParserMemory memory = {0};
                        memory.identity = TY_(ParseDefList);
                        memory.original_node = list;
                        memory.reentry_node = node;
                        memory.reentry_state = STATE_POST_NODEISCENTER;
                        pushMemory( doc, memory );
                        DEBUG_LOG(SPRTF("<<<Exiting ParseDefList 3 with a node to parse: %s\n", node->element));
                        return node;
                    }
                }

                if ( !( nodeIsDT(node) || nodeIsDD(node) ) )
                {
                    TY_(UngetToken)( doc );

                    if ( !(node->tag->model & (CM_BLOCK | CM_INLINE)) )
                    {
                        TY_(Report)( doc, list, node, TAG_NOT_ALLOWED_IN );
                        DEBUG_LOG(SPRTF("<<<Exit ParseDefList 3 %d... CM_EMPTY\n", --in_parse_deflist));
                        return NULL;
                    }

                    /* if DD appeared directly in BODY then exclude blocks */
                    if ( !(node->tag->model & CM_INLINE) && lexer->excludeBlocks )
                    {
                        DEBUG_LOG(SPRTF("<<<Exit ParseDefList 4 %d... CM_EMPTY\n", --in_parse_deflist));
                        return NULL;
                    }

                    node = TY_(InferredTag)( doc, TidyTag_DD );
                    TY_(Report)( doc, list, node, MISSING_STARTTAG );
                }

                if ( node->type == EndTag )
                {
                    TY_(Report)( doc, list, node, DISCARDING_UNEXPECTED );
                    TY_(FreeNode)( doc, node );
                    continue;
                }

                /* node should be <DT> or <DD> */
                TY_(InsertNodeAtEnd)( list, node );
                {
                    TidyParserMemory memory = {0};
                    memory.identity = TY_(ParseDefList);
                    memory.original_node = list;
                    memory.reentry_node = node;
                    memory.reentry_state = STATE_INITIAL;
                    pushMemory( doc, memory );
                    DEBUG_LOG(SPRTF("<<<Exiting ParseDefList 4 with a node to parse: %s\n", node->element));
                    return node;
                }
            } break;

            case STATE_POST_NODEISCENTER:
            {
                Node *parent;

                lexer->excludeBlocks = yes;

                /* This state is only ever reached by re-entry, so no local
                 * from the call that pushed the center survives. The center
                 * was inserted directly before or after `list`, so its parent
                 * is taken to be `list->parent`. Using `list` rather than
                 * `node` matters: `node` must not be dereferenced until the
                 * pointer comparison below shows it is still attached.
                 */
                parent = list->parent;

                /* now create a new dl element,
                 * unless node has been blown away because the
                 * center was empty, as above.
                 */
                if ( parent && parent->last == node )
                {
                    list = TY_(InferredTag)( doc, TidyTag_DL );
                    TY_(InsertNodeAfterElement)( node, list );
                }

                state = STATE_INITIAL;
                continue;
            } break;

            default:
                break;
        } /* switch */
    } /* while */

    /* Out of tokens without an end tag (node is NULL here). */
    TY_(Report)( doc, list, node, MISSING_ENDTAG_FOR );
    DEBUG_LOG(SPRTF("<<<Exit ParseDefList at bottom %d... CM_EMPTY\n", --in_parse_deflist));
    return NULL;
}
/** MARK: TY_(ParseEmpty)
 *  Parses an empty element.
 *
 *  Nothing is done unless the lexer's `isvoyager` flag is set (presumably
 *  XHTML input -- confirm against the lexer). In that case the next token is
 *  peeked: an end tag matching `element` is consumed and freed, anything else
 *  is put back for the caller.
 *
 *  This parser never hands a node back to the controller and never pushes
 *  parser memory.
 *
 *  @param doc      The Tidy document being parsed.
 *  @param element  The empty element.
 *  @param mode     Token mode used when peeking at the next token.
 *  @returns Always NULL.
 */
Node* TY_(ParseEmpty)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
{
    Lexer* lexer = doc->lexer;
    Node *next;

    if ( !lexer->isvoyager )
        return NULL;

    next = TY_(GetToken)( doc, mode );
    if ( next == NULL )
        return NULL;

    if ( next->type == EndTag && next->tag == element->tag )
    {
        /* Explicit end tag for the empty element: swallow it. */
        TY_(FreeNode)( doc, next );
    }
    else
    {
        /* TY_(Report)(doc, element, node, ELEMENT_NOT_EMPTY); */
        TY_(UngetToken)( doc );
    }

    return NULL;
}
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseFrameSet)
* Parses the ` frameset ` tag .
*
* This is a non - recursing parser . It uses the document ' s parser memory stack
* to send subsequent nodes back to the controller for dispatching to parsers .
* This parser is also re - enterable , so that post - processing can occur after
* such dispatching .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
Node * TY_ ( ParseFrameSet ) ( TidyDocImpl * doc , Node * frameset , GetTokenMode ARG_UNUSED ( mode ) )
2011-11-17 02:44:16 +00:00
{
Lexer * lexer = doc - > lexer ;
2021-08-05 12:18:30 +00:00
Node * node ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/*
If we ' re re - entering , then we need to setup from a previous state ,
instead of starting fresh . We can pull what we need from the document ' s
stack .
*/
if ( frameset = = NULL )
2016-03-04 18:28:49 +00:00
{
2021-08-05 12:18:30 +00:00
TidyParserMemory memory = popMemory ( doc ) ;
node = memory . reentry_node ; /* Throwaway, because we replace it entering the loop. */
frameset = memory . original_node ;
DEBUG_LOG ( SPRTF ( " >>>Re-Enter ParseFrameSet with %s \n " , node - > element ) ) ;
}
else
{
DEBUG_LOG ( SPRTF ( " <<<Enter ParseFrameSet with %s \n " , frameset - > element ) ) ;
if ( cfg ( doc , TidyAccessibilityCheckLevel ) = = 0 )
{
doc - > badAccess | = BA_USING_FRAMES ;
}
2016-03-04 18:28:49 +00:00
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
while ( ( node = TY_ ( GetToken ) ( doc , IgnoreWhitespace ) ) ! = NULL )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
if ( node - > tag = = frameset - > tag & & node - > type = = EndTag )
2011-11-17 02:44:16 +00:00
{
TY_ ( FreeNode ) ( doc , node ) ;
2021-08-05 12:18:30 +00:00
frameset - > closed = yes ;
TrimSpaces ( doc , frameset ) ;
2021-07-28 23:45:57 +00:00
return NULL ;
2011-11-17 02:44:16 +00:00
}
/* deal with comments etc. */
2021-08-05 12:18:30 +00:00
if ( InsertMisc ( frameset , node ) )
2011-11-17 02:44:16 +00:00
continue ;
2021-08-05 12:18:30 +00:00
if ( node - > tag = = NULL )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( Report ) ( doc , frameset , node , DISCARDING_UNEXPECTED ) ;
2011-11-17 02:44:16 +00:00
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
2021-08-05 12:18:30 +00:00
if ( TY_ ( nodeIsElement ) ( node ) )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
if ( node - > tag & & node - > tag - > model & CM_HEAD )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
MoveToHead ( doc , frameset , node ) ;
2011-11-17 02:44:16 +00:00
continue ;
}
}
2021-08-05 12:18:30 +00:00
if ( nodeIsBODY ( node ) )
2016-04-08 21:08:56 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( UngetToken ) ( doc ) ;
node = TY_ ( InferredTag ) ( doc , TidyTag_NOFRAMES ) ;
TY_ ( Report ) ( doc , frameset , node , INSERTING_TAG ) ;
2017-07-02 19:10:20 +00:00
}
2021-08-05 12:18:30 +00:00
if ( node - > type = = StartTag & & ( node - > tag & & node - > tag - > model & CM_FRAMES ) )
2017-07-02 19:10:20 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( InsertNodeAtEnd ) ( frameset , node ) ;
lexer - > excludeBlocks = no ;
/*
* We don ' t really have to do anything when re - entering , except
* setting up the state when we left . No post - processing means
* this stays simple .
2016-04-08 21:08:56 +00:00
*/
2021-08-05 12:18:30 +00:00
TidyParserMemory memory = { 0 } ;
memory . identity = TY_ ( ParseFrameSet ) ;
memory . original_node = frameset ;
memory . reentry_node = node ;
memory . mode = MixedContent ;
pushMemory ( doc , memory ) ;
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseFrameSet with a node to parse: %s \n " , node - > element ) ) ;
return node ;
2016-04-08 21:08:56 +00:00
}
2021-08-05 12:18:30 +00:00
else if ( node - > type = = StartEndTag & & ( node - > tag & & node - > tag - > model & CM_FRAMES ) )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( InsertNodeAtEnd ) ( frameset , node ) ;
continue ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/* discard unexpected tags */
/* WAI [6.5.1.4] link is being discarded outside of NOFRAME */
if ( nodeIsA ( node ) )
doc - > badAccess | = BA_INVALID_LINK_NOFRAMES ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
TY_ ( Report ) ( doc , frameset , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
2011-11-17 02:44:16 +00:00
}
2021-08-05 12:18:30 +00:00
TY_ ( Report ) ( doc , frameset , node , MISSING_ENDTAG_FOR ) ;
2021-07-28 23:45:57 +00:00
return NULL ;
2011-11-17 02:44:16 +00:00
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseHead)
* Parses the ` head ` tag .
*
* This is a non - recursing parser . It uses the document ' s parser memory stack
* to send subsequent nodes back to the controller for dispatching to parsers .
* This parser is also re - enterable , so that post - processing can occur after
* such dispatching .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
Node * TY_ ( ParseHead ) ( TidyDocImpl * doc , Node * head , GetTokenMode ARG_UNUSED ( mode ) )
2011-11-17 02:44:16 +00:00
{
2021-07-28 23:45:57 +00:00
Lexer * lexer = doc - > lexer ;
Node * node ;
2021-08-05 12:18:30 +00:00
int HasTitle = 0 ;
int HasBase = 0 ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( head = = NULL )
{
TidyParserMemory memory = popMemory ( doc ) ;
head = memory . original_node ;
node = memory . reentry_node ; /* Throwaway, as main loop overrwrites anyway. */
HasTitle = memory . register_b_1 ;
HasBase = memory . register_b_2 ;
DEBUG_LOG ( SPRTF ( " >>>Re-Enter ParseHead with %s \n " , node - > element ) ) ;
}
else
{
DEBUG_LOG ( SPRTF ( " >>>Enter ParseHead... \n " ) ) ;
}
2021-07-28 23:45:57 +00:00
while ( ( node = TY_ ( GetToken ) ( doc , IgnoreWhitespace ) ) ! = NULL )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
if ( node - > tag = = head - > tag & & node - > type = = EndTag )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( FreeNode ) ( doc , node ) ;
head - > closed = yes ;
break ;
2011-11-17 02:44:16 +00:00
}
2021-08-05 12:18:30 +00:00
/* find and discard multiple <head> elements */
/* find and discard <html> in <head> elements */
if ( ( node - > tag = = head - > tag | | nodeIsHTML ( node ) ) & & node - > type = = StartTag )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( Report ) ( doc , head , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
if ( TY_ ( nodeIsText ) ( node ) )
{
/*\ Issue #132 - avoid warning for missing body tag,
* if configured to - - omit - otpional - tags yes
* Issue # 314 - and if - - show - body - only
\ */
if ( ! cfgBool ( doc , TidyOmitOptionalTags ) & &
! showingBodyOnly ( doc ) )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( Report ) ( doc , head , node , TAG_NOT_ALLOWED_IN ) ;
2011-11-17 02:44:16 +00:00
}
2021-08-05 12:18:30 +00:00
TY_ ( UngetToken ) ( doc ) ;
break ;
2011-11-17 02:44:16 +00:00
}
2021-08-05 12:18:30 +00:00
if ( node - > type = = ProcInsTag & & node - > element & &
TY_ ( tmbstrcmp ) ( node - > element , " xml-stylesheet " ) = = 0 )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( Report ) ( doc , head , node , TAG_NOT_ALLOWED_IN ) ;
TY_ ( InsertNodeBeforeElement ) ( TY_ ( FindHTML ) ( doc ) , node ) ;
2011-11-17 02:44:16 +00:00
continue ;
}
2021-08-05 12:18:30 +00:00
/* deal with comments etc. */
if ( InsertMisc ( head , node ) )
2011-11-17 02:44:16 +00:00
continue ;
2021-08-05 12:18:30 +00:00
if ( node - > type = = DocTypeTag )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
InsertDocType ( doc , head , node ) ;
continue ;
2011-11-17 02:44:16 +00:00
}
2021-08-05 12:18:30 +00:00
/* discard unknown tags */
if ( node - > tag = = NULL )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( Report ) ( doc , head , node , DISCARDING_UNEXPECTED ) ;
2011-11-17 02:44:16 +00:00
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
/*
2021-08-05 12:18:30 +00:00
if it doesn ' t belong in the head then
treat as implicit end of head and deal
with as part of the body
2011-11-17 02:44:16 +00:00
*/
2021-08-05 12:18:30 +00:00
if ( ! ( node - > tag - > model & CM_HEAD ) )
{
/* #545067 Implicit closing of head broken - warn only for XHTML input */
if ( lexer - > isvoyager )
TY_ ( Report ) ( doc , head , node , TAG_NOT_ALLOWED_IN ) ;
TY_ ( UngetToken ) ( doc ) ;
break ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
if ( TY_ ( nodeIsElement ) ( node ) )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
if ( nodeIsTITLE ( node ) )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
+ + HasTitle ;
if ( HasTitle > 1 )
TY_ ( Report ) ( doc , head , node ,
head ?
TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS ) ;
2011-11-17 02:44:16 +00:00
}
2021-08-05 12:18:30 +00:00
else if ( nodeIsBASE ( node ) )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
+ + HasBase ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
if ( HasBase > 1 )
TY_ ( Report ) ( doc , head , node ,
head ?
TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS ) ;
2011-11-17 02:44:16 +00:00
}
2021-08-05 12:18:30 +00:00
TY_ ( InsertNodeAtEnd ) ( head , node ) ;
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
TidyParserMemory memory = { 0 } ;
memory . identity = TY_ ( ParseHead ) ;
memory . original_node = head ;
memory . reentry_node = node ;
memory . register_b_1 = HasTitle ;
memory . register_b_2 = HasBase ;
pushMemory ( doc , memory ) ;
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseHead with a node to parse: %s \n " , node - > element ) ) ;
return node ;
2011-11-17 02:44:16 +00:00
}
}
2021-08-05 12:18:30 +00:00
/* discard unexpected text nodes and end tags */
TY_ ( Report ) ( doc , head , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
2011-11-17 02:44:16 +00:00
}
2021-08-05 12:18:30 +00:00
DEBUG_LOG ( SPRTF ( " <<<Exit ParseHead at bottom \n " ) ) ;
2021-07-28 23:45:57 +00:00
return NULL ;
2011-11-17 02:44:16 +00:00
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseHTML)
* Parses the ` html ` tag . At this point , other root - level stuff ( doctype ,
* comments ) are already set up , and here we handle all of the complexities
* of things such as frameset documents , etc .
*
* This is a non - recursing parser . It uses the document ' s parser memory stack
* to send subsequent nodes back to the controller for dispatching to parsers .
* This parser is also re - enterable , so that post - processing can occur after
* such dispatching .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
Node * TY_ ( ParseHTML ) ( TidyDocImpl * doc , Node * html , GetTokenMode mode )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
Node * node = NULL ;
Node * head = NULL ;
Node * frameset = NULL ;
Node * noframes = NULL ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
enum parserState {
STATE_INITIAL , /* This is the initial state for every parser. */
STATE_COMPLETE , /* Complete! */
STATE_PRE_BODY , /* In this state, we'll consider frames vs. body. */
STATE_PARSE_BODY , /* In this state, we can parse the body. */
STATE_PARSE_HEAD , /* In this state, we will setup head for parsing. */
STATE_PARSE_HEAD_REENTER , /* Resume here after parsing head. */
STATE_PARSE_NOFRAMES , /* In this state, we can parse noframes content. */
STATE_PARSE_NOFRAMES_REENTER , /* In this state, we can restore more state. */
STATE_PARSE_FRAMESET , /* In this state, we will parse frameset content. */
STATE_PARSE_FRAMESET_REENTER , /* We need to cleanup some things after parsing frameset. */
} state = STATE_INITIAL ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
# if defined(ENABLE_DEBUG_LOG)
static int parser_depth = 0 ;
static int parser_count = 0 ;
SPRTF ( " >>>Entering ParseHTML, count: %d, depth %d \n " , + + parser_count , + + parser_depth ) ;
# endif
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
TY_ ( SetOptionBool ) ( doc , TidyXmlTags , no ) ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/*
If we ' re re - entering , then we need to setup from a previous state ,
instead of starting fresh . We can pull what we need from the document ' s
stack .
*/
if ( html = = NULL )
{
TidyParserMemory memory = popMemory ( doc ) ;
node = memory . reentry_node ;
mode = memory . reentry_mode ;
state = memory . reentry_state ;
html = memory . original_node ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/*
This main loop pulls tokens from the lexer until we ' re out of tokens ,
or until there ' s no more work to do .
*/
while ( state ! = STATE_COMPLETE )
{
/*
We don ' t want to get the next token unless we ' re
done with this one . Using this flag is much quicker
than using ` UngetToken ( ) ` every time we want to keep
the token .
*/
if ( state = = STATE_INITIAL | | state = = STATE_PRE_BODY )
node = TY_ ( GetToken ) ( doc , IgnoreWhitespace ) ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
switch ( state )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
/**************************************************************
This case is all about finding a head tag and dealing with
cases were we don ' t , so that we can move on to parsing a head
tag .
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
case STATE_INITIAL :
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
/*
The only way we can possibly be here is if the lexer
had nothing to give us . Thus we ' ll create our own
head , and set the signal to start parsing it .
*/
if ( node = = NULL )
{
node = TY_ ( InferredTag ) ( doc , TidyTag_HEAD ) ;
state = STATE_PARSE_HEAD ;
continue ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/* We found exactly what we expected: head. */
if ( nodeIsHEAD ( node ) )
{
state = STATE_PARSE_HEAD ;
continue ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/* We did not expect to find an html closing tag here! */
if ( html & & ( node - > tag = = html - > tag ) & & ( node - > type = = EndTag ) )
{
TY_ ( Report ) ( doc , html , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/* Find and discard multiple <html> elements. */
if ( html & & ( node - > tag = = html - > tag ) & & ( node - > type = = StartTag ) )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( Report ) ( doc , html , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
2011-11-17 02:44:16 +00:00
}
2021-08-05 12:18:30 +00:00
/* Deal with comments, etc. */
if ( InsertMisc ( html , node ) )
continue ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/*
At this point , we didn ' t find a head tag , so put the
token back and create our own head tag , so we can
move on .
*/
2011-11-17 02:44:16 +00:00
TY_ ( UngetToken ) ( doc ) ;
2021-08-05 12:18:30 +00:00
node = TY_ ( InferredTag ) ( doc , TidyTag_HEAD ) ;
state = STATE_PARSE_HEAD ;
2011-11-17 02:44:16 +00:00
continue ;
2021-08-05 12:18:30 +00:00
} break ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/**************************************************************
This case determines whether we ' re dealing with body or
frameset + noframes , and sets things up accordingly .
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
case STATE_PRE_BODY :
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
if ( node = = NULL )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
if ( frameset = = NULL ) /* Implied body. */
{
node = TY_ ( InferredTag ) ( doc , TidyTag_BODY ) ;
state = STATE_PARSE_BODY ;
} else {
state = STATE_COMPLETE ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
continue ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/* Robustly handle html tags. */
if ( node - > tag = = html - > tag )
{
if ( node - > type ! = StartTag & & frameset = = NULL )
TY_ ( Report ) ( doc , html , node , DISCARDING_UNEXPECTED ) ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/* Deal with comments, etc. */
if ( InsertMisc ( html , node ) )
continue ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/* If frameset document, coerce <body> to <noframes> */
if ( nodeIsBODY ( node ) )
{
if ( node - > type ! = StartTag )
{
TY_ ( Report ) ( doc , html , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
if ( cfg ( doc , TidyAccessibilityCheckLevel ) = = 0 )
{
if ( frameset ! = NULL )
{
TY_ ( UngetToken ) ( doc ) ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
if ( noframes = = NULL )
{
noframes = TY_ ( InferredTag ) ( doc , TidyTag_NOFRAMES ) ;
TY_ ( InsertNodeAtEnd ) ( frameset , noframes ) ;
TY_ ( Report ) ( doc , html , noframes , INSERTING_TAG ) ;
}
else
{
if ( noframes - > type = = StartEndTag )
noframes - > type = StartTag ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
state = STATE_PARSE_NOFRAMES ;
continue ;
}
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
TY_ ( ConstrainVersion ) ( doc , ~ VERS_FRAMESET ) ;
state = STATE_PARSE_BODY ;
continue ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/* Flag an error if we see more than one frameset. */
if ( nodeIsFRAMESET ( node ) )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
if ( node - > type ! = StartTag )
{
TY_ ( Report ) ( doc , html , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
if ( frameset ! = NULL )
TY_ ( Report ) ( doc , html , node , DUPLICATE_FRAMESET ) ;
else
frameset = node ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
state = STATE_PARSE_FRAMESET ;
continue ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/* If not a frameset document coerce <noframes> to <body>. */
if ( nodeIsNOFRAMES ( node ) )
{
if ( node - > type ! = StartTag )
{
TY_ ( Report ) ( doc , html , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
if ( frameset = = NULL )
{
TY_ ( Report ) ( doc , html , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
node = TY_ ( InferredTag ) ( doc , TidyTag_BODY ) ;
state = STATE_PARSE_BODY ;
continue ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
if ( noframes = = NULL )
{
noframes = node ;
TY_ ( InsertNodeAtEnd ) ( frameset , noframes ) ;
state = STATE_PARSE_NOFRAMES ;
}
else
{
TY_ ( FreeNode ) ( doc , node ) ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
continue ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/* Deal with some other element that we're not expecting. */
if ( TY_ ( nodeIsElement ) ( node ) )
{
if ( node - > tag & & node - > tag - > model & CM_HEAD )
{
MoveToHead ( doc , html , node ) ;
continue ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/* Discard illegal frame element following a frameset. */
if ( frameset ! = NULL & & nodeIsFRAME ( node ) )
{
TY_ ( Report ) ( doc , html , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
}
2011-11-17 02:44:16 +00:00
TY_ ( UngetToken ) ( doc ) ;
2021-08-05 12:18:30 +00:00
/* Insert other content into noframes element. */
if ( frameset )
{
if ( noframes = = NULL )
{
noframes = TY_ ( InferredTag ) ( doc , TidyTag_NOFRAMES ) ;
TY_ ( InsertNodeAtEnd ) ( frameset , noframes ) ;
}
else
{
TY_ ( Report ) ( doc , html , node , NOFRAMES_CONTENT ) ;
if ( noframes - > type = = StartEndTag )
noframes - > type = StartTag ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
TY_ ( ConstrainVersion ) ( doc , VERS_FRAMESET ) ;
state = STATE_PARSE_NOFRAMES ;
continue ;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
node = TY_ ( InferredTag ) ( doc , TidyTag_BODY ) ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/* Issue #132 - disable inserting BODY tag warning
BUT only if NOT - - show - body - only yes */
if ( ! showingBodyOnly ( doc ) )
TY_ ( Report ) ( doc , html , node , INSERTING_TAG ) ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
TY_ ( ConstrainVersion ) ( doc , ~ VERS_FRAMESET ) ;
state = STATE_PARSE_BODY ;
continue ;
} break ;
/**************************************************************
In this case , we ' re ready to parse the head , and move on to
look for the body or body alternative .
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
case STATE_PARSE_HEAD :
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
TidyParserMemory memory = { 0 } ;
memory . identity = TY_ ( ParseHTML ) ;
memory . mode = mode ;
memory . original_node = html ;
memory . reentry_node = node ;
memory . reentry_mode = mode ;
memory . reentry_state = STATE_PARSE_HEAD_REENTER ;
TY_ ( InsertNodeAtEnd ) ( html , node ) ;
pushMemory ( doc , memory ) ;
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseHTML at STATE_PARSE_HEAD, count: %d, depth %d \n " , parser_count , - - parser_depth ) ) ;
return node ;
} break ;
case STATE_PARSE_HEAD_REENTER :
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
head = node ;
state = STATE_PRE_BODY ;
} break ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/**************************************************************
In this case , we can finally parse a body .
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
case STATE_PARSE_BODY :
{
TidyParserMemory memory = { 0 } ;
memory . identity = NULL ; /* we don't need to reenter */
memory . mode = mode ;
memory . original_node = html ;
memory . reentry_node = NULL ;
memory . reentry_mode = mode ;
memory . reentry_state = STATE_COMPLETE ;
TY_ ( InsertNodeAtEnd ) ( html , node ) ;
pushMemory ( doc , memory ) ;
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseHTML at STATE_PARSE_BODY, count: %d, depth %d \n " , parser_count , - - parser_depth ) ) ;
return node ;
} break ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/**************************************************************
In this case , we will parse noframes . If necessary , the
node is already inserted in the proper spot .
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
case STATE_PARSE_NOFRAMES :
{
TidyParserMemory memory = { 0 } ;
memory . identity = TY_ ( ParseHTML ) ;
memory . mode = mode ;
memory . original_node = html ;
memory . reentry_node = frameset ;
memory . reentry_mode = mode ;
memory . reentry_state = STATE_PARSE_NOFRAMES_REENTER ;
pushMemory ( doc , memory ) ;
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseHTML at STATE_PARSE_NOFRAMES, count: %d, depth %d \n " , parser_count , - - parser_depth ) ) ;
return noframes ;
} break ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
case STATE_PARSE_NOFRAMES_REENTER :
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
frameset = node ;
state = STATE_PRE_BODY ;
} break ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/**************************************************************
In this case , we parse the frameset , and look for noframes
content to merge later if necessary .
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
case STATE_PARSE_FRAMESET :
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
TidyParserMemory memory = { 0 } ;
memory . identity = TY_ ( ParseHTML ) ;
memory . mode = mode ;
memory . original_node = html ;
memory . reentry_node = frameset ;
memory . reentry_mode = mode ;
memory . reentry_state = STATE_PARSE_FRAMESET_REENTER ;
TY_ ( InsertNodeAtEnd ) ( html , node ) ;
pushMemory ( doc , memory ) ;
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseHTML at STATE_PARSE_FRAMESET, count: %d, depth %d \n " , parser_count , - - parser_depth ) ) ;
return node ;
} break ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
case ( STATE_PARSE_FRAMESET_REENTER ) :
{
frameset = node ;
/*
See if it includes a noframes element so that
we can merge subsequent noframes elements .
*/
for ( node = frameset - > content ; node ; node = node - > next )
{
if ( nodeIsNOFRAMES ( node ) )
noframes = node ;
}
state = STATE_PRE_BODY ;
} break ;
2021-07-28 23:45:57 +00:00
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/**************************************************************
We really shouldn ' t get here , but if we do , finish nicely .
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
default :
{
state = STATE_COMPLETE ;
}
} /* switch */
} /* while */
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseHTML at bottom, count: %d, depth %d \n " , parser_count , - - parser_depth ) ) ;
2021-07-28 23:45:57 +00:00
return NULL ;
2011-11-17 02:44:16 +00:00
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseInline)
 *  Parse inline element nodes.
 *
 *  This is a non-recursing parser. Whenever a child element has to be parsed,
 *  this function inserts the child, pushes a re-entry record onto the
 *  document's parser memory stack and returns the child, so that the caller
 *  can dispatch it to the appropriate parser. The function is re-entered with
 *  `element == NULL`, in which case `element` and `mode` are restored from the
 *  record on top of the memory stack and token processing resumes.
 *
 *  @param doc      The Tidy document being parsed.
 *  @param element  The inline (or inline-parsed block) element whose content
 *                  is to be collected, or NULL when re-entering.
 *  @param mode     The token mode on entry. On first entry anything other than
 *                  `Preformatted` is replaced with `MixedContent`.
 *  @returns        A child node that still has to be parsed, or NULL when this
 *                  parser is finished with `element`.
 */
Node* TY_(ParseInline)( TidyDocImpl *doc, Node *element, GetTokenMode mode )
{
#if defined(ENABLE_DEBUG_LOG)
    static int in_parse_inline = 0;
#endif
    Lexer* lexer = doc->lexer;
    Node *node = NULL;
    Node *parent = NULL;

    if ( element == NULL )
    {
        /* Re-entry: restore our state from the parser memory stack. */
        TidyParserMemory memory = popMemory( doc );
        element = memory.original_node;
        node = memory.reentry_node; /* Throwaway, as main loop overwrites anyway. */
        mode = memory.reentry_mode;
        DEBUG_LOG(SPRTF(">>>Re-Enter ParseInline with %s\n", node->element));
    }
    else
    {
        DEBUG_LOG(SPRTF(">>>Entering ParseInline %d...\n", ++in_parse_inline));

        if ( element->tag->model & CM_EMPTY )
        {
            DEBUG_LOG(SPRTF("<<<Exit ParseInline 1 %d...\n", --in_parse_inline));
            return NULL;
        }

        /*
         ParseInline is used for some block level elements like H1 to H6.
         For such elements we need to insert inline emphasis tags currently
         on the inline stack. For inline elements, we normally push them
         onto the inline stack provided they aren't implicit or OBJECT/APPLET.
         This test is carried out in PushInline and PopInline, see istack.c.

         InlineDup(...) is not called for elements with a CM_MIXED (inline and
         block) content model, e.g. <del> or <ins>, otherwise constructs like

           <p>111<a name='foo'>222<del>333</del>444</a>555</p>
           <p>111<span>222<del>333</del>444</span>555</p>
           <p>111<em>222<del>333</del>444</em>555</p>

         will get corrupted.
        */
        if ( (TY_(nodeHasCM)(element, CM_BLOCK) || nodeIsDT(element)) &&
             !TY_(nodeHasCM)(element, CM_MIXED) )
            TY_(InlineDup)( doc, NULL );
        else if ( TY_(nodeHasCM)(element, CM_INLINE) )
            TY_(PushInline)( doc, element );

        if ( nodeIsNOBR(element) )
            doc->badLayout |= USING_NOBR;
        else if ( nodeIsFONT(element) )
            doc->badLayout |= USING_FONT;

        /* Inline elements may or may not be within a preformatted element */
        if ( mode != Preformatted )
            mode = MixedContent;
    }

    while ( (node = TY_(GetToken)(doc, mode)) != NULL )
    {
        /* end tag for current element */
        if ( node->tag == element->tag && node->type == EndTag )
        {
            if ( element->tag->model & CM_INLINE )
                TY_(PopInline)( doc, node );

            TY_(FreeNode)( doc, node );

            if ( !(mode & Preformatted) )
                TrimSpaces( doc, element );

            /*
             if a font element wraps an anchor and nothing else
             then move the font element inside the anchor since
             otherwise it won't alter the anchor text color
            */
            if ( nodeIsFONT(element) &&
                 element->content && element->content == element->last )
            {
                Node *child = element->content;

                if ( nodeIsA(child) )
                {
                    child->parent = element->parent;
                    child->next = element->next;
                    child->prev = element->prev;

                    element->next = NULL;
                    element->prev = NULL;
                    element->parent = child;

                    element->content = child->content;
                    element->last = child->last;
                    child->content = element;

                    TY_(FixNodeLinks)( child );
                    TY_(FixNodeLinks)( element );
                }
            }

            element->closed = yes;
            TrimSpaces( doc, element );
            DEBUG_LOG(SPRTF("<<<Exit ParseInline 2 %d...\n", --in_parse_inline));
            return NULL;
        }

        /* <u>...<u>  map 2nd <u> to </u> if 1st is explicit */
        /* (see additional conditions below) */
        /* otherwise emphasis nesting is probably unintentional */
        /* big, small, sub, sup have cumulative effect to leave them alone */
        if ( node->type == StartTag
             && node->tag == element->tag
             && TY_(IsPushed)( doc, node )
             && !node->implicit
             && !element->implicit
             && node->tag && (node->tag->model & CM_INLINE)
             && !nodeIsA(node)
             && !nodeIsFONT(node)
             && !nodeIsBIG(node)
             && !nodeIsSMALL(node)
             && !nodeIsSUB(node)
             && !nodeIsSUP(node)
             && !nodeIsQ(node)
             && !nodeIsSPAN(node)
             && cfgBool(doc, TidyCoerceEndTags)
           )
        {
            /* proceeds only if "node" does not have any attribute and
               follows a text node not finishing with a space */
            if ( element->content != NULL && node->attributes == NULL
                 && TY_(nodeIsText)(element->last)
                 && !TY_(TextNodeEndWithSpace)(doc->lexer, element->last) )
            {
                TY_(Report)( doc, element, node, COERCE_TO_ENDTAG );
                node->type = EndTag;
                TY_(UngetToken)( doc );
                continue;
            }

            if ( node->attributes == NULL || element->attributes == NULL )
                TY_(Report)( doc, element, node, NESTED_EMPHASIS );
        }
        else if ( TY_(IsPushed)(doc, node) && node->type == StartTag &&
                  nodeIsQ(node) )
        {
            /*
             Issue #215 - such nested quotes are NOT a problem if HTML5, so
             only issue this warning if NOT HTML5 mode.
            */
            if ( TY_(HTMLVersion)(doc) != HT50 )
            {
                TY_(Report)( doc, element, node, NESTED_QUOTATION );
            }
        }

        if ( TY_(nodeIsText)(node) )
        {
            /* only called for 1st child */
            if ( element->content == NULL && !(mode & Preformatted) )
                TrimSpaces( doc, element );

            if ( node->start >= node->end )
            {
                TY_(FreeNode)( doc, node );
                continue;
            }

            TY_(InsertNodeAtEnd)( element, node );
            continue;
        }

        /* mixed content model so allow text */
        if ( InsertMisc(element, node) )
            continue;

        /* deal with HTML tags */
        if ( nodeIsHTML(node) )
        {
            if ( TY_(nodeIsElement)(node) )
            {
                TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
                TY_(FreeNode)( doc, node );
                continue;
            }

            /* otherwise infer end of inline element */
            TY_(UngetToken)( doc );

            if ( !(mode & Preformatted) )
                TrimSpaces( doc, element );

            DEBUG_LOG(SPRTF("<<<Exit ParseInline 3 %d...\n", --in_parse_inline));
            return NULL;
        }

        /* within <dt> or <pre> map <p> to <br> */
        if ( nodeIsP(node) &&
             node->type == StartTag &&
             ( (mode & Preformatted) ||
               nodeIsDT(element) ||
               DescendantOf(element, TidyTag_DT)
             )
           )
        {
            node->tag = TY_(LookupTagDef)( TidyTag_BR );
            TidyDocFree( doc, node->element );
            node->element = TY_(tmbstrdup)( doc->allocator, "br" );
            TrimSpaces( doc, element );
            TY_(InsertNodeAtEnd)( element, node );
            continue;
        }

        /* <p> allowed within <address> in HTML 4.01 Transitional */
        if ( nodeIsP(node) &&
             node->type == StartTag &&
             nodeIsADDRESS(element) )
        {
            TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
            TY_(InsertNodeAtEnd)( element, node );

            /*
             Hand the <p> back to the caller for dispatching instead of
             invoking its parser directly: a direct call would discard any
             node that parser returns for further parsing. We resume here
             on re-entry.
            */
            {
                TidyParserMemory memory = {0};
                memory.identity = TY_(ParseInline);
                memory.original_node = element;
                memory.reentry_node = node;
                memory.mode = mode;
                memory.reentry_mode = mode;
                pushMemory( doc, memory );
                DEBUG_LOG(SPRTF("<<<Exiting ParseInline 2 with a node to parse: %s\n", node->element));
                return node;
            }
        }

        /* ignore unknown and PARAM tags */
        if ( node->tag == NULL || nodeIsPARAM(node) )
        {
            TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, node );
            continue;
        }

        /* coerce </br> to <br> */
        if ( nodeIsBR(node) && node->type == EndTag )
            node->type = StartTag;

        if ( node->type == EndTag )
        {
            /* Note: </br> cannot reach this point, it was coerced above. */
            if ( nodeIsP(node) )
            {
                /* coerce unmatched </p> to <br><br> */
                if ( !DescendantOf(element, TidyTag_P) )
                {
                    TY_(CoerceNode)( doc, node, TidyTag_BR, no, no );
                    TrimSpaces( doc, element );
                    TY_(InsertNodeAtEnd)( element, node );
                    node = TY_(InferredTag)( doc, TidyTag_BR );
                    TY_(InsertNodeAtEnd)( element, node ); /* todo: check this */
                    continue;
                }
            }
            else if ( TY_(nodeHasCM)(node, CM_INLINE)
                      && !nodeIsA(node)
                      && !TY_(nodeHasCM)(node, CM_OBJECT)
                      && TY_(nodeHasCM)(element, CM_INLINE) )
            {
                /* allow any inline end tag to end current element */

                /* http://tidy.sf.net/issue/1426419 */
                /* but, like the browser, retain an earlier inline element.
                   This is implemented by setting the lexer into a mode
                   where it gets tokens from the inline stack rather than
                   from the input stream. Check if the scenario fits. */
                if ( !nodeIsA(element)
                     && (node->tag != element->tag)
                     && TY_(IsPushed)( doc, node )
                     && TY_(IsPushed)( doc, element ) )
                {
                    /* we have something like
                       <b>bold <i>bold and italic</b> italics</i> */
                    if ( TY_(SwitchInline)( doc, element, node ) )
                    {
                        TY_(Report)( doc, element, node, NON_MATCHING_ENDTAG );
                        TY_(UngetToken)( doc ); /* put this back */
                        TY_(InlineDup1)( doc, NULL, element ); /* dupe the <i>, after </b> */
                        if ( !(mode & Preformatted) )
                            TrimSpaces( doc, element );

                        DEBUG_LOG(SPRTF("<<<Exit ParseInline 4 %d...\n", --in_parse_inline));
                        return NULL; /* close <i>, but will re-open it, after </b> */
                    }
                }
                TY_(PopInline)( doc, element );

                if ( !nodeIsA(element) )
                {
                    if ( nodeIsA(node) && node->tag != element->tag )
                    {
                        TY_(Report)( doc, element, node, MISSING_ENDTAG_BEFORE );
                        TY_(UngetToken)( doc );
                    }
                    else
                    {
                        TY_(Report)( doc, element, node, NON_MATCHING_ENDTAG );
                        TY_(FreeNode)( doc, node );
                    }

                    if ( !(mode & Preformatted) )
                        TrimSpaces( doc, element );

                    DEBUG_LOG(SPRTF("<<<Exit ParseInline 5 %d...\n", --in_parse_inline));
                    return NULL;
                }

                /* if parent is <a> then discard unexpected inline end tag */
                TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
                TY_(FreeNode)( doc, node );
                continue;
            }
            /* special case </tr> etc. for stuff moved in front of table */
            else if ( lexer->exiled
                      && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
            {
                TY_(UngetToken)( doc );
                TrimSpaces( doc, element );
                DEBUG_LOG(SPRTF("<<<Exit ParseInline 6 %d...\n", --in_parse_inline));
                return NULL;
            }
        }

        /* allow any header tag to end current header */
        if ( TY_(nodeHasCM)(node, CM_HEADING) && TY_(nodeHasCM)(element, CM_HEADING) )
        {
            if ( node->tag == element->tag )
            {
                TY_(Report)( doc, element, node, NON_MATCHING_ENDTAG );
                TY_(FreeNode)( doc, node );
            }
            else
            {
                TY_(Report)( doc, element, node, MISSING_ENDTAG_BEFORE );
                TY_(UngetToken)( doc );
            }

            if ( !(mode & Preformatted) )
                TrimSpaces( doc, element );

            DEBUG_LOG(SPRTF("<<<Exit ParseInline 7 %d...\n", --in_parse_inline));
            return NULL;
        }

        /*
         an <A> tag ends any open <A> element,
         but <A href=...> is mapped to </A><A href=...>
         (#427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00)
        */
        if ( nodeIsA(node) && !node->implicit &&
             (nodeIsA(element) || DescendantOf(element, TidyTag_A)) )
        {
            /* coerce <a> to </a> unless it has some attributes */
            /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
            /* other fixes by Dave Raggett */
            if ( node->type != EndTag && node->attributes == NULL
                 && cfgBool(doc, TidyCoerceEndTags) )
            {
                node->type = EndTag;
                TY_(Report)( doc, element, node, COERCE_TO_ENDTAG );
                TY_(UngetToken)( doc );
                continue;
            }

            TY_(UngetToken)( doc );
            TY_(Report)( doc, element, node, MISSING_ENDTAG_BEFORE );

            if ( !(mode & Preformatted) )
                TrimSpaces( doc, element );

            DEBUG_LOG(SPRTF("<<<Exit ParseInline 8 %d...\n", --in_parse_inline));
            return NULL;
        }

        if ( element->tag->model & CM_HEADING )
        {
            if ( nodeIsCENTER(node) || nodeIsDIV(node) )
            {
                if ( !TY_(nodeIsElement)(node) )
                {
                    TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
                    TY_(FreeNode)( doc, node );
                    continue;
                }

                TY_(Report)( doc, element, node, TAG_NOT_ALLOWED_IN );

                /* insert center as parent if heading is empty */
                if ( element->content == NULL )
                {
                    InsertNodeAsParent( element, node );
                    continue;
                }

                /* split heading and make center parent of 2nd part */
                TY_(InsertNodeAfterElement)( element, node );

                if ( !(mode & Preformatted) )
                    TrimSpaces( doc, element );

                element = TY_(CloneNode)( doc, element );
                TY_(InsertNodeAtEnd)( node, element );
                continue;
            }

            if ( nodeIsHR(node) )
            {
                if ( !TY_(nodeIsElement)(node) )
                {
                    TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
                    TY_(FreeNode)( doc, node );
                    continue;
                }

                TY_(Report)( doc, element, node, TAG_NOT_ALLOWED_IN );

                /* insert hr before heading if heading is empty */
                if ( element->content == NULL )
                {
                    TY_(InsertNodeBeforeElement)( element, node );
                    continue;
                }

                /* split heading and insert hr before 2nd part */
                TY_(InsertNodeAfterElement)( element, node );

                if ( !(mode & Preformatted) )
                    TrimSpaces( doc, element );

                element = TY_(CloneNode)( doc, element );
                TY_(InsertNodeAfterElement)( node, element );
                continue;
            }
        }

        if ( nodeIsDT(element) )
        {
            if ( nodeIsHR(node) )
            {
                Node *dd;

                if ( !TY_(nodeIsElement)(node) )
                {
                    TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
                    TY_(FreeNode)( doc, node );
                    continue;
                }

                TY_(Report)( doc, element, node, TAG_NOT_ALLOWED_IN );
                dd = TY_(InferredTag)( doc, TidyTag_DD );

                /* insert hr within dd before dt if dt is empty */
                if ( element->content == NULL )
                {
                    TY_(InsertNodeBeforeElement)( element, dd );
                    TY_(InsertNodeAtEnd)( dd, node );
                    continue;
                }

                /* split dt and insert hr within dd before 2nd part */
                TY_(InsertNodeAfterElement)( element, dd );
                TY_(InsertNodeAtEnd)( dd, node );

                if ( !(mode & Preformatted) )
                    TrimSpaces( doc, element );

                element = TY_(CloneNode)( doc, element );
                TY_(InsertNodeAfterElement)( dd, element );
                continue;
            }
        }

        /*
         if this is the end tag for an ancestor element
         then infer end tag for this element
        */
        if ( node->type == EndTag )
        {
            for ( parent = element->parent;
                  parent != NULL; parent = parent->parent )
            {
                if ( node->tag == parent->tag )
                {
                    if ( !(element->tag->model & CM_OPT) && !element->implicit )
                        TY_(Report)( doc, element, node, MISSING_ENDTAG_BEFORE );

                    if ( TY_(IsPushedLast)( doc, element, node ) )
                        TY_(PopInline)( doc, element );
                    TY_(UngetToken)( doc );

                    if ( !(mode & Preformatted) )
                        TrimSpaces( doc, element );

                    DEBUG_LOG(SPRTF("<<<Exit ParseInline 9 %d...\n", --in_parse_inline));
                    return NULL;
                }
            }
        }

        /*
         block level tags end this element
         Issue #333 - There seems an exception if the element is a 'span',
         and the node just collected is a 'meta'. The 'meta' can not have
         CM_INLINE added, nor can the 'span' have CM_MIXED added without
         big consequences.
         There may be other exceptions to be added...
        */
        if ( !(node->tag->model & CM_INLINE) &&
             !(element->tag->model & CM_MIXED) &&
             !(nodeIsSPAN(element) && nodeIsMETA(node)) )
        {
            if ( !TY_(nodeIsElement)(node) )
            {
                TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
                TY_(FreeNode)( doc, node );
                continue;
            }

            /* HTML5 */
            if ( nodeIsDATALIST(element) )
            {
                TY_(ConstrainVersion)( doc, ~VERS_HTML5 );
            }
            else if ( !(element->tag->model & CM_OPT) )
            {
                TY_(Report)( doc, element, node, MISSING_ENDTAG_BEFORE );
            }

            if ( node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK) )
            {
                MoveToHead( doc, element, node );
                continue;
            }

            /*
             prevent anchors from propagating into block tags
             except for headings h1 to h6
            */
            if ( nodeIsA(element) )
            {
                if ( node->tag && !(node->tag->model & CM_HEADING) )
                    TY_(PopInline)( doc, element );
                else if ( !(element->content) )
                {
                    TY_(DiscardElement)( doc, element );
                    TY_(UngetToken)( doc );
                    DEBUG_LOG(SPRTF("<<<Exit ParseInline 10 %d...\n", --in_parse_inline));
                    return NULL;
                }
            }

            TY_(UngetToken)( doc );

            if ( !(mode & Preformatted) )
                TrimSpaces( doc, element );

            DEBUG_LOG(SPRTF("<<<Exit ParseInline 11 %d...\n", --in_parse_inline));
            return NULL;
        }

        /* parse inline element */
        if ( TY_(nodeIsElement)(node) )
        {
            if ( node->implicit )
                TY_(Report)( doc, element, node, INSERTING_TAG );

            /* trim white space before <br> */
            if ( nodeIsBR(node) )
                TrimSpaces( doc, element );

            TY_(InsertNodeAtEnd)( element, node );

            /* Remember where we are, and send the child back for parsing. */
            {
                TidyParserMemory memory = {0};
                memory.identity = TY_(ParseInline);
                memory.original_node = element;
                memory.reentry_node = node;
                memory.mode = mode;
                memory.reentry_mode = mode;
                pushMemory( doc, memory );
                DEBUG_LOG(SPRTF("<<<Exiting ParseInline 1 with a node to parse: %s\n", node->element));
                return node;
            }
        }

        /* discard unexpected tags */
        TY_(Report)( doc, element, node, DISCARDING_UNEXPECTED );
        TY_(FreeNode)( doc, node );
        continue;
    }

    /* Ran out of tokens before the element was closed. */
    if ( !(element->tag->model & CM_OPT) )
        TY_(Report)( doc, element, node, MISSING_ENDTAG_FOR );

    DEBUG_LOG(SPRTF("<<<Exit ParseInline 12 %d...\n", --in_parse_inline));
    return NULL;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseList)
 *  Parses list tags.
 *
 *  This is a non-recursing parser. For every item that has to be parsed it
 *  pushes a re-entry record onto the document's parser memory stack and
 *  returns the item, so that the caller can dispatch it to the appropriate
 *  parser. The function is re-entered with `list == NULL`, in which case the
 *  list is restored from the record on top of the memory stack.
 *
 *  @param doc   The Tidy document being parsed.
 *  @param list  The list element whose items are to be collected, or NULL
 *               when re-entering.
 *  @param mode  Unused; tokens are always fetched with `IgnoreWhitespace`.
 *  @returns     A node that still has to be parsed, or NULL when this parser
 *               is finished with the list.
 */
Node* TY_(ParseList)( TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode) )
{
#if defined(ENABLE_DEBUG_LOG)
    static int in_parse_list = 0;
#endif
    Lexer* lexer = doc->lexer;
    Node *node = NULL;
    Node *parent = NULL;
    Node *lastli = NULL;
    Bool wasblock = no;
    Bool nodeisOL = no;

    if ( list == NULL )
    {
        /* Re-entry: restore our state from the parser memory stack. */
        TidyParserMemory memory = popMemory( doc );
        list = memory.original_node;
        node = memory.reentry_node; /* Throwaway, as main loop overwrites anyway. */
        DEBUG_LOG(SPRTF(">>>Re-Enter ParseList with %s\n", node->element));
    }
    else
    {
        DEBUG_LOG(SPRTF(">>>Entering ParseList %d...\n", ++in_parse_list));
        if ( list->tag->model & CM_EMPTY )
        {
            DEBUG_LOG(SPRTF("<<<Exit ParseList 1 %d... CM_EMPTY\n", --in_parse_list));
            return NULL;
        }
    }

    /*
     Must be evaluated only once `list` is known: on re-entry the parameter
     is NULL and the real list comes from the memory stack.
    */
    nodeisOL = nodeIsOL( list );

    lexer->insert = NULL;  /* defer implicit inline start tags */

    while ( (node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL )
    {
        Bool foundLI = no;

        if ( node->tag == list->tag && node->type == EndTag )
        {
            TY_(FreeNode)( doc, node );
            list->closed = yes;
            DEBUG_LOG(SPRTF("<<<Exit ParseList 2 %d... Endtag\n", --in_parse_list));
            return NULL;
        }

        /* deal with comments etc. */
        if ( InsertMisc(list, node) )
            continue;

        if ( node->type != TextNode && node->tag == NULL )
        {
            TY_(Report)( doc, list, node, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, node );
            continue;
        }

        if ( node->type == TextNode )
        {
            uint ch, ix = node->start;

            /* Issue #572 - Skip whitespace. */
            while ( ix < node->end && (ch = (lexer->lexbuf[ix] & 0xff))
                    && (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n') )
                ++ix;

            if ( ix >= node->end )
            {
                /* Issue #572 - Discard if ALL whitespace. */
                TY_(FreeNode)( doc, node );
                continue;
            }
        }

        /*
         if this is the end tag for an ancestor element
         then infer end tag for this element
        */
        if ( node->type == EndTag )
        {
            if ( nodeIsFORM(node) )
            {
                BadForm( doc );
                TY_(Report)( doc, list, node, DISCARDING_UNEXPECTED );
                TY_(FreeNode)( doc, node );
                continue;
            }

            if ( TY_(nodeHasCM)(node, CM_INLINE) )
            {
                TY_(Report)( doc, list, node, DISCARDING_UNEXPECTED );
                TY_(PopInline)( doc, node );
                TY_(FreeNode)( doc, node );
                continue;
            }

            for ( parent = list->parent;
                  parent != NULL; parent = parent->parent )
            {
                /* Do not match across BODY to avoid infinite loop
                   between ParseBody and this parser,
                   See http://tidy.sf.net/bug/1053626. */
                if ( nodeIsBODY(parent) )
                    break;

                if ( node->tag == parent->tag )
                {
                    TY_(Report)( doc, list, node, MISSING_ENDTAG_BEFORE );
                    TY_(UngetToken)( doc );
                    DEBUG_LOG(SPRTF("<<<Exit ParseList 3 %d... No End Tag\n", --in_parse_list));
                    return NULL;
                }
            }

            TY_(Report)( doc, list, node, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, node );
            continue;
        }

        if ( !nodeIsLI(node) && nodeisOL )
        {
            /* Issue #572 - A <ol><li> can have nested <ol> elements */
            foundLI = FindLastLI( list, &lastli ); /* find last <li> */
        }

        if ( nodeIsLI(node) || (TY_(IsHTML5Mode)(doc) && !foundLI) )
        {
            /* node is <LI> OR
               Issue #396 - A <ul> can have Zero or more <li> elements
             */
            TY_(InsertNodeAtEnd)( list, node );
        }
        else
        {
            TY_(UngetToken)( doc );

            if ( TY_(nodeHasCM)(node, CM_BLOCK) && lexer->excludeBlocks )
            {
                TY_(Report)( doc, list, node, MISSING_ENDTAG_BEFORE );
                DEBUG_LOG(SPRTF("<<<Exit ParseList 4 %d... No End Tag\n", --in_parse_list));
                return NULL;
            }
            /* http://tidy.sf.net/issue/1316307 */
            /* In exiled mode, return so table processing can continue. */
            else if ( lexer->exiled
                      && (TY_(nodeHasCM)(node, CM_TABLE|CM_ROWGRP|CM_ROW)
                          || nodeIsTABLE(node)) )
            {
                DEBUG_LOG(SPRTF("<<<Exit ParseList 5 %d... exiled\n", --in_parse_list));
                return NULL;
            }

            /* http://tidy.sf.net/issue/836462
               If "list" is an ordered list, insert the next tag within
               the last <li> to preserve the numbering to match the visual
               rendering of most browsers. */
            if ( nodeisOL && FindLastLI(list, &lastli) )
            {
                /* Create a node for error reporting */
                node = TY_(InferredTag)( doc, TidyTag_LI );
                TY_(Report)( doc, list, node, MISSING_STARTTAG );
                TY_(FreeNode)( doc, node );
                node = lastli;
            }
            else
            {
                /* Add an inferred <li> */
                wasblock = TY_(nodeHasCM)( node, CM_BLOCK );
                node = TY_(InferredTag)( doc, TidyTag_LI );
                /* Add "display: inline" to avoid a blank line after <li> with
                   Internet Explorer. See http://tidy.sf.net/issue/836462 */
                TY_(AddStyleProperty)( doc, node,
                                       wasblock
                                       ? "list-style: none; display: inline"
                                       : "list-style: none"
                                     );
                TY_(Report)( doc, list, node, MISSING_STARTTAG );
                TY_(InsertNodeAtEnd)( list, node );
            }
        }

        /* Remember where we are, and send the item back for parsing. */
        {
            TidyParserMemory memory = {0};
            memory.identity = TY_(ParseList);
            memory.original_node = list;
            memory.reentry_node = node;
            memory.mode = IgnoreWhitespace;
            pushMemory( doc, memory );
            DEBUG_LOG(SPRTF("<<<Exiting ParseList with a node to parse: %s\n", node->element));
            return node;
        }
    }

    /* Ran out of tokens before the list was closed. */
    TY_(Report)( doc, list, node, MISSING_ENDTAG_FOR );
    DEBUG_LOG(SPRTF("<<<Exit ParseList 6 %d... missing end tag\n", --in_parse_list));
    return NULL;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseNamespace)
 *  Act as a generic XML (sub)tree parser: collect each node and add it
 *  to the DOM, without any further validation. It's useful for tags that
 *  have XML-like content, such as `svg` and `math`.
 *
 *  @note Perhaps this is poorly named, as we're not parsing the namespace
 *    of a particular tag, but a tag with XML-like content.
 *
 *  @todo Add schema- or other-hierarchy-definition-based validation
 *    of the subtree here.
 *
 *  This parser does not recurse and does not hand nodes back to the caller:
 *  it consumes the whole subtree in a single loop, tracking the current
 *  parent itself, and always returns NULL. While it runs, the lexer's inline
 *  stack base is raised to the current stack size; it is put back on every
 *  exit path.
 *
 *  @param doc       The Tidy document being parsed.
 *  @param basenode  The element whose XML-like content is to be collected.
 *  @param mode      Ignored; tokens are always fetched with `OtherNamespace`.
 *  @returns         Always NULL.
 */
Node* TY_(ParseNamespace)( TidyDocImpl* doc, Node *basenode, GetTokenMode mode )
{
    Lexer* lexer = doc->lexer;
    Node *node;
    Node *parent = basenode;
    uint istackbase;
    AttVal* av; /* #130 MathML attr and entity fix! */

    /* a la <table>: defer popping elements off the inline stack */
    TY_(DeferDup)( doc );
    istackbase = lexer->istackbase;
    lexer->istackbase = lexer->istacksize;

    mode = OtherNamespace; /* Preformatted; IgnoreWhitespace; */

    while ( (node = TY_(GetToken)(doc, mode)) != NULL )
    {
        /*
         fix check to skip action in InsertMisc for regular/empty
         nodes, which we don't want here...

         The way we do it here is by checking and processing everything
         and only what remains goes into InsertMisc()
        */

        /* is this a close tag? And does it match the current parent node? */
        if ( node->type == EndTag )
        {
            /*
             to prevent end tags flowing from one 'alternate namespace' we
             check this in two phases: first we check if the tag is a
             descendant of the current node, and when it is, we check whether
             it is the end tag for a node /within/ or /outside/ the basenode.
            */
            Bool outside;
            Node *mp = FindMatchingDescendant( parent, node, basenode, &outside );

            if ( mp != NULL )
            {
                /*
                 when mp != parent as we might expect,
                 infer end tags until we 'hit' the matched
                 parent or the basenode
                */
                Node *n;

                for ( n = parent;
                      n != NULL && n != basenode->parent && n != mp;
                      n = n->parent )
                {
                    n->closed = yes;
                    TY_(Report)( doc, n->parent, n, MISSING_ENDTAG_BEFORE );
                }

                /* Issue #369 - the asserts that used to live here
                   (n == mp when inside, n == basenode->parent when outside)
                   can fire in simple cases, so they were removed pending
                   feedback from the original author. */
                if ( outside == no )
                {
                    /* EndTag for a node within the basenode subtree. Roll on... */
                    if ( n )
                        n->closed = yes;
                    TY_(FreeNode)( doc, node );

                    node = n;
                    parent = node ? node->parent : NULL;
                }
                else
                {
                    /* EndTag for a node outside the basenode subtree: let the caller handle that. */
                    TY_(UngetToken)( doc );
                    node = basenode;
                    parent = node->parent;
                }

                /* when we've arrived at the end-node for the base node, it's quitting time */
                if ( node == basenode )
                {
                    lexer->istackbase = istackbase;
                    assert( basenode && basenode->closed == yes );
                    return NULL;
                }
            }
            else
            {
                /* unmatched close tag: report an error and discard
                   (Issue #308 - NON_MATCHING_ENDTAG seemed the wrong warning) */
                TY_(Report)( doc, parent, node, DISCARDING_UNEXPECTED );
                assert( parent );
                TY_(FreeNode)( doc, node ); /* Issue #308 - Discard unexpected end tag memory */
            }
        }
        else
        {
            /* #130 MathML attr and entity fix!
               Drop any dictionary match: attributes here may only
               'accidentally' carry the name of a known HTML attribute. */
            for ( av = node->attributes; av; av = av->next )
            {
                av->dict = NULL; /* does something need to be freed? */
            }

            /* add another child to the current parent */
            TY_(InsertNodeAtEnd)( parent, node );

            /* only a start tag opens a new nesting level */
            if ( node->type == StartTag )
                parent = node;
        }
    }

    /* Ran out of tokens: undo the inline stack base change here too. */
    lexer->istackbase = istackbase;
    TY_(Report)( doc, basenode->parent, basenode, MISSING_ENDTAG_FOR );
    return NULL;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseNoFrames)
* Parses the ` noframes ` tag .
*
* This is a non - recursing parser . It uses the document ' s parser memory stack
* to send subsequent nodes back to the controller for dispatching to parsers .
* This parser is also re - enterable , so that post - processing can occur after
* such dispatching .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
Node * TY_ ( ParseNoFrames ) ( TidyDocImpl * doc , Node * noframes , GetTokenMode mode )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
Lexer * lexer = doc - > lexer ;
2021-07-28 23:45:57 +00:00
Node * node = NULL ;
2021-08-05 12:18:30 +00:00
Bool body_seen = no ;
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
enum parserState {
STATE_INITIAL , /* This is the initial state for every parser. */
STATE_POST_NODEISBODY , /* To-do after re-entering after checks. */
STATE_COMPLETE , /* Done with the switch. */
} state = STATE_INITIAL ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/*
If we ' re re - entering , then we need to setup from a previous state ,
instead of starting fresh . We can pull what we need from the document ' s
stack .
*/
2021-08-05 12:18:30 +00:00
if ( noframes = = NULL )
2011-11-17 02:44:16 +00:00
{
2021-07-28 23:45:57 +00:00
TidyParserMemory memory = popMemory ( doc ) ;
2021-08-05 12:18:30 +00:00
node = memory . reentry_node ; /* Throwaway, because we replace it entering the loop anyway.*/
noframes = memory . original_node ;
2021-07-28 23:45:57 +00:00
state = memory . reentry_state ;
2021-08-05 12:18:30 +00:00
body_seen = memory . register_b_1 ;
DEBUG_LOG ( SPRTF ( " >>>Re-Enter ParseNoFrames with %s \n " , node - > element ) ) ;
}
else
{
DEBUG_LOG ( SPRTF ( " >>>Enter ParseNoFrames with %s \n " , noframes - > element ) ) ;
if ( cfg ( doc , TidyAccessibilityCheckLevel ) = = 0 )
{
doc - > badAccess | = BA_USING_NOFRAMES ;
}
2011-11-17 02:44:16 +00:00
}
2021-08-05 12:18:30 +00:00
mode = IgnoreWhitespace ;
2021-07-28 23:45:57 +00:00
while ( state ! = STATE_COMPLETE )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
if ( state = = STATE_INITIAL )
{
node = TY_ ( GetToken ) ( doc , mode ) ;
DEBUG_LOG ( SPRTF ( " ---ParseNoFrames got token %s with mode %u \n " , node - > element , mode ) ) ;
}
2021-07-28 23:45:57 +00:00
switch ( state )
{
case STATE_INITIAL :
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
if ( node = = NULL )
2011-11-17 02:44:16 +00:00
{
2021-08-05 12:18:30 +00:00
state = STATE_COMPLETE ;
2011-11-17 02:44:16 +00:00
continue ;
}
2021-08-05 12:18:30 +00:00
if ( node - > tag = = noframes - > tag & & node - > type = = EndTag )
2021-07-28 23:45:57 +00:00
{
TY_ ( FreeNode ) ( doc , node ) ;
2021-08-05 12:18:30 +00:00
noframes - > closed = yes ;
TrimSpaces ( doc , noframes ) ;
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseNoFrames 1. \n " ) ) ;
return NULL ;
2021-07-28 23:45:57 +00:00
}
2021-08-05 12:18:30 +00:00
if ( nodeIsFRAME ( node ) | | nodeIsFRAMESET ( node ) )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
TrimSpaces ( doc , noframes ) ;
if ( node - > type = = EndTag )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( Report ) ( doc , noframes , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ; /* Throw it away */
2021-07-28 23:45:57 +00:00
}
else
{
2021-08-05 12:18:30 +00:00
TY_ ( Report ) ( doc , noframes , node , MISSING_ENDTAG_BEFORE ) ;
TY_ ( UngetToken ) ( doc ) ;
2021-07-28 23:45:57 +00:00
}
2021-08-05 12:18:30 +00:00
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseNoFrames 2. \n " ) ) ;
return NULL ;
2021-07-28 23:45:57 +00:00
}
2021-08-05 12:18:30 +00:00
if ( nodeIsHTML ( node ) )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
if ( TY_ ( nodeIsElement ) ( node ) )
TY_ ( Report ) ( doc , noframes , node , DISCARDING_UNEXPECTED ) ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
2021-07-28 23:45:57 +00:00
}
2021-08-05 12:18:30 +00:00
/* deal with comments etc. */
if ( InsertMisc ( noframes , node ) )
continue ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( nodeIsBODY ( node ) & & node - > type = = StartTag )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
TidyParserMemory memory = { 0 } ;
memory . identity = TY_ ( ParseNoFrames ) ;
memory . original_node = noframes ;
memory . reentry_node = node ;
memory . reentry_state = STATE_POST_NODEISBODY ;
memory . register_b_1 = lexer - > seenEndBody ;
memory . mode = IgnoreWhitespace ;
TY_ ( InsertNodeAtEnd ) ( noframes , node ) ;
pushMemory ( doc , memory ) ;
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseNoFrames with a node to parse: %s \n " , node - > element ) ) ;
return node ;
}
/* implicit body element inferred */
if ( TY_ ( nodeIsText ) ( node ) | | ( node - > tag & & node - > type ! = EndTag ) )
{
Node * body = TY_ ( FindBody ) ( doc ) ;
if ( body | | lexer - > seenEndBody )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
if ( body = = NULL )
{
TY_ ( Report ) ( doc , noframes , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
if ( TY_ ( nodeIsText ) ( node ) )
{
TY_ ( UngetToken ) ( doc ) ;
node = TY_ ( InferredTag ) ( doc , TidyTag_P ) ;
TY_ ( Report ) ( doc , noframes , node , CONTENT_AFTER_BODY ) ;
}
TY_ ( InsertNodeAtEnd ) ( body , node ) ;
2021-07-28 23:45:57 +00:00
}
else
{
2021-08-05 12:18:30 +00:00
TY_ ( UngetToken ) ( doc ) ;
node = TY_ ( InferredTag ) ( doc , TidyTag_BODY ) ;
if ( cfgBool ( doc , TidyXmlOut ) )
TY_ ( Report ) ( doc , noframes , node , INSERTING_TAG ) ;
TY_ ( InsertNodeAtEnd ) ( noframes , node ) ;
2021-07-28 23:45:57 +00:00
}
2021-08-05 12:18:30 +00:00
{
TidyParserMemory memory = { 0 } ;
memory . identity = TY_ ( ParseNoFrames ) ;
memory . original_node = noframes ;
memory . reentry_node = node ;
memory . mode = IgnoreWhitespace ; /*MixedContent*/
memory . reentry_state = STATE_INITIAL ;
pushMemory ( doc , memory ) ;
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseNoFrames with a node to parse: %s \n " , node - > element ) ) ;
return node ;
}
2021-07-28 23:45:57 +00:00
}
2021-08-05 12:18:30 +00:00
/* discard unexpected end tags */
TY_ ( Report ) ( doc , noframes , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
2021-07-28 23:45:57 +00:00
} break ;
2021-08-05 12:18:30 +00:00
case STATE_POST_NODEISBODY :
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
/* fix for bug http://tidy.sf.net/bug/887259 */
if ( body_seen & & TY_ ( FindBody ) ( doc ) ! = node )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( CoerceNode ) ( doc , node , TidyTag_DIV , no , no ) ;
MoveNodeToBody ( doc , node ) ;
2021-07-28 23:45:57 +00:00
}
2021-08-05 12:18:30 +00:00
state = STATE_INITIAL ;
continue ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
} break ;
2021-07-28 23:45:57 +00:00
default :
2021-08-05 12:18:30 +00:00
break ;
2021-07-28 23:45:57 +00:00
} /* switch */
} /* while */
2021-08-05 12:18:30 +00:00
TY_ ( Report ) ( doc , noframes , node , MISSING_ENDTAG_FOR ) ;
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseNoFrames at bottom. \n " ) ) ;
2021-07-28 23:45:57 +00:00
return NULL ;
}
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseOptGroup)
* Parses the ` optgroup ` tag .
*
* This is a non - recursing parser . It uses the document ' s parser memory stack
* to send subsequent nodes back to the controller for dispatching to parsers .
* This parser is also re - enterable , so that post - processing can occur after
* such dispatching .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
Node * TY_ ( ParseOptGroup ) ( TidyDocImpl * doc , Node * field , GetTokenMode ARG_UNUSED ( mode ) )
2021-07-28 23:45:57 +00:00
{
Lexer * lexer = doc - > lexer ;
2021-08-05 12:18:30 +00:00
Node * node ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( field = = NULL )
2021-07-28 23:45:57 +00:00
{
TidyParserMemory memory = popMemory ( doc ) ;
2021-08-05 12:18:30 +00:00
field = memory . original_node ;
node = memory . reentry_node ; /* Throwaway, as main loop overrwrites anyway. */
DEBUG_LOG ( SPRTF ( " >>>Re-Enter ParseOptGroup with %s \n " , node - > element ) ) ;
}
else
{
DEBUG_LOG ( SPRTF ( " >>>Enter ParseOptGroup \n " ) ) ;
2021-07-28 23:45:57 +00:00
}
2021-08-05 12:18:30 +00:00
lexer - > insert = NULL ; /* defer implicit inline start tags */
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
while ( ( node = TY_ ( GetToken ) ( doc , IgnoreWhitespace ) ) ! = NULL )
{
if ( node - > tag = = field - > tag & & node - > type = = EndTag )
{
TY_ ( FreeNode ) ( doc , node ) ;
field - > closed = yes ;
TrimSpaces ( doc , field ) ;
return NULL ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/* deal with comments etc. */
if ( InsertMisc ( field , node ) )
continue ;
if ( node - > type = = StartTag & &
( nodeIsOPTION ( node ) | | nodeIsOPTGROUP ( node ) ) )
{
TidyParserMemory memory = { 0 } ;
if ( nodeIsOPTGROUP ( node ) )
TY_ ( Report ) ( doc , field , node , CANT_BE_NESTED ) ;
TY_ ( InsertNodeAtEnd ) ( field , node ) ;
memory . identity = TY_ ( ParseOptGroup ) ;
memory . original_node = field ;
memory . reentry_node = node ;
pushMemory ( doc , memory ) ;
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseOptGroup with a node to parse: %s \n " , node - > element ) ) ;
return node ;
}
/* discard unexpected tags */
TY_ ( Report ) ( doc , field , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
2021-07-28 23:45:57 +00:00
}
2021-08-05 12:18:30 +00:00
return NULL ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParsePre)
 *  Parses the content of a `pre` element.
 *
 *  This is a non-recursing parser. Child elements are handed back to the
 *  controller after the position has been saved with pushMemory(); the
 *  function is then re-entered with `pre == NULL` and restores itself with
 *  popMemory(). When re-entered in STATE_RENTRY_ACTION, a fresh inferred
 *  `pre` is inserted after the child that interrupted the original one and
 *  parsing carries on inside that new element.
 *
 *  @param doc   The Tidy document being parsed.
 *  @param pre   The `pre` node to fill, or NULL when re-entering.
 *  @param mode  Unused; tokens are always fetched in Preformatted mode.
 *  @returns The child node to parse next, or NULL when this parser is done.
 */
Node* TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) )
{
    Node *node = NULL;

    enum parserState {
        STATE_INITIAL,        /* Fetch and classify the next token. */
        STATE_RENTRY_ACTION,  /* Re-open an inferred <pre> after a disallowed child. */
        STATE_COMPLETE,       /* Token stream exhausted; leave the loop. */
    } state = STATE_INITIAL;

    if ( pre != NULL )
    {
        DEBUG_LOG(SPRTF(" >>>Enter ParsePre \n "));
        if ( pre->tag->model & CM_EMPTY )
        {
            DEBUG_LOG(SPRTF(" <<<Exiting ParsePre 1 \n "));
            return NULL;
        }
    }
    else
    {
        /* Re-entry: the saved child is only needed by STATE_RENTRY_ACTION;
           STATE_INITIAL replaces it with the next token. */
        TidyParserMemory saved = popMemory( doc );
        pre = saved.original_node;
        node = saved.reentry_node;
        state = saved.reentry_state;
        DEBUG_LOG(SPRTF(" >>>Re-Enter ParsePre with %s \n ", node->element));
    }

    TY_(InlineDup)( doc, NULL ); /* tell lexer to insert inlines if needed */

    while ( state != STATE_COMPLETE )
    {
        switch ( state )
        {
            case STATE_RENTRY_ACTION:
            {
                /* The child that broke out of the old <pre> has been parsed;
                   continue inside a new inferred <pre> placed right after it. */
                Node *inferred = TY_(InferredTag)( doc, TidyTag_PRE );
                TY_(Report)( doc, pre, inferred, INSERTING_TAG );
                TY_(InsertNodeAfterElement)( node, inferred );
                pre = inferred;
                state = STATE_INITIAL;
                continue;
            } break;

            case STATE_INITIAL:
            {
                node = TY_(GetToken)( doc, Preformatted );

                if ( node == NULL )
                {
                    state = STATE_COMPLETE;
                    continue;
                }

                /* End tag for <pre> itself or for one of its ancestors. */
                if ( node->type == EndTag &&
                     (node->tag == pre->tag || DescendantOf(pre, TagId(node))) )
                {
                    if ( nodeIsBODY(node) || nodeIsHTML(node) )
                    {
                        TY_(Report)( doc, pre, node, DISCARDING_UNEXPECTED );
                        TY_(FreeNode)( doc, node );
                        continue;
                    }

                    if ( node->tag != pre->tag )
                    {
                        /* Ancestor end tag: leave it for the ancestor's parser. */
                        TY_(Report)( doc, pre, node, MISSING_ENDTAG_BEFORE );
                        TY_(UngetToken)( doc );
                    }
                    else
                    {
                        TY_(FreeNode)( doc, node );
                    }

                    pre->closed = yes;
                    TrimSpaces( doc, pre );
                    DEBUG_LOG(SPRTF(" <<<Exiting ParsePre 2 \n "));
                    return NULL;
                }

                if ( TY_(nodeIsText)(node) )
                {
                    TY_(InsertNodeAtEnd)( pre, node );
                    continue;
                }

                /* deal with comments etc. */
                if ( InsertMisc(pre, node) )
                    continue;

                if ( node->tag == NULL )
                {
                    TY_(Report)( doc, pre, node, DISCARDING_UNEXPECTED );
                    TY_(FreeNode)( doc, node );
                    continue;
                }

                /* strip unexpected tags */
                if ( !PreContent(doc, node) )
                {
                    /* fix for http://tidy.sf.net/bug/772205 */
                    if ( node->type == EndTag )
                    {
                        /* http://tidy.sf.net/issue/1590220 */
                        if ( doc->lexer->exiled
                             && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
                        {
                            TY_(UngetToken)( doc );
                            TrimSpaces( doc, pre );
                            DEBUG_LOG(SPRTF(" <<<Exiting ParsePre 3 \n "));
                            return NULL;
                        }

                        TY_(Report)( doc, pre, node, DISCARDING_UNEXPECTED );
                        TY_(FreeNode)( doc, node );
                        continue;
                    }

                    /* http://tidy.sf.net/issue/1590220
                       Every end tag has left the case above, so only
                       non-end tokens reach this point. */
                    if ( TY_(nodeHasCM)(node, CM_TABLE|CM_ROW)
                         || nodeIsTABLE(node) )
                    {
                        if ( !doc->lexer->exiled )
                            /* No missing close warning if exiled. */
                            TY_(Report)( doc, pre, node, MISSING_ENDTAG_BEFORE );

                        TY_(UngetToken)( doc );
                        DEBUG_LOG(SPRTF(" <<<Exiting ParsePre 4 \n "));
                        return NULL;
                    }

                    /*
                      Historical note carried with this code:

                      This is basically what Tidy 04 August 2000 did and far
                      more accurate with respect to browser behaviour than the
                      code it replaced. Tidy could try to propagate the <pre>
                      into each disallowed child where <pre> is allowed in
                      order to replicate some browsers' behaviour, but there
                      are a lot of exceptions, e.g. Internet Explorer does not
                      propagate <pre> into table cells while Mozilla does.
                      Opera 6 never propagates <pre> into blocklevel elements
                      while Opera 7 behaves much like Mozilla.

                      Tidy behaves thus mostly like Opera 6 except for nested
                      <pre> elements which are handled like Mozilla takes them
                      (Opera 6 closes all <pre> after the first </pre>).

                      There are similar issues like replacing <p> in <pre>
                      with <br>, for example

                        <pre>...<p>...</pre>                 (Input)
                        <pre>...<br>...</pre>                (Tidy)
                        <pre>...<br>...</pre>                (Opera 7 and Internet Explorer)
                        <pre>...<br><br>...</pre>            (Opera 6 and Mozilla)

                        <pre>...<p>...</p>...</pre>          (Input)
                        <pre>...<br>......</pre>             (Tidy, BUG!)
                        <pre>...<br>...<br>...</pre>         (Internet Explorer)
                        <pre>...<br><br>...<br><br>...</pre> (Mozilla, Opera 6)
                        <pre>...<br>...<br><br>...</pre>     (Opera 7)

                      or something similar, they could also be closing the
                      <pre> and propagate the <pre> into the newly opened <p>.

                      Todo: IMG, OBJECT, APPLET, BIG, SMALL, SUB, SUP, FONT,
                      and BASEFONT are disallowed in <pre>, Tidy neither
                      detects this nor does it perform any cleanup operation.
                      Tidy should at least issue a warning if it encounters
                      such constructs.

                      Todo: discarding </p> is obviously a bug, it should be
                      replaced by <br>.
                    */
                    TY_(InsertNodeAfterElement)( pre, node );
                    TY_(Report)( doc, pre, node, MISSING_ENDTAG_BEFORE );

                    {
                        TidyParserMemory saved = {0};
                        saved.identity = TY_(ParsePre);
                        saved.original_node = pre;
                        saved.reentry_node = node;
                        saved.reentry_state = STATE_RENTRY_ACTION;
                        pushMemory( doc, saved );
                        DEBUG_LOG(SPRTF(" <<<Exiting ParsePre with a node to parse: %s \n ", node->element));
                        return node;
                    }
                }

                if ( nodeIsP(node) )
                {
                    if ( node->type != StartTag )
                    {
                        TY_(Report)( doc, pre, node, DISCARDING_UNEXPECTED );
                        TY_(FreeNode)( doc, node );
                    }
                    else
                    {
                        TY_(Report)( doc, pre, node, USING_BR_INPLACE_OF );

                        /* trim white space before <p> in <pre> */
                        TrimSpaces( doc, pre );

                        /* the <p> start tag becomes a <br> */
                        TY_(CoerceNode)( doc, node, TidyTag_BR, no, no );
                        TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */
                        TY_(InsertNodeAtEnd)( pre, node );
                    }
                    continue;
                }

                if ( TY_(nodeIsElement)(node) )
                {
                    TidyParserMemory saved = {0};

                    /* trim white space before <br> */
                    if ( nodeIsBR(node) )
                        TrimSpaces( doc, pre );

                    TY_(InsertNodeAtEnd)( pre, node );

                    saved.identity = TY_(ParsePre);
                    saved.original_node = pre;
                    saved.reentry_node = node;
                    saved.reentry_state = STATE_INITIAL;
                    pushMemory( doc, saved );
                    DEBUG_LOG(SPRTF(" <<<Exiting ParsePre with a node to parse: %s \n ", node->element));
                    return node;
                }

                /* discard unexpected tags */
                TY_(Report)( doc, pre, node, DISCARDING_UNEXPECTED );
                TY_(FreeNode)( doc, node );
            } break;

            default:
                break;
        } /* switch */
    } /* while */

    /* Token stream ended without a closing tag; node is NULL here. */
    TY_(Report)( doc, pre, node, MISSING_ENDTAG_FOR );
    DEBUG_LOG(SPRTF(" <<<Exiting ParsePre at bottom \n "));
    return NULL;
}
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseRow)
* Parses the ` row ` tag .
*
* This is a non - recursing parser . It uses the document ' s parser memory stack
* to send subsequent nodes back to the controller for dispatching to parsers .
* This parser is also re - enterable , so that post - processing can occur after
* such dispatching .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
Node * TY_ ( ParseRow ) ( TidyDocImpl * doc , Node * row , GetTokenMode ARG_UNUSED ( mode ) )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
# if defined(ENABLE_DEBUG_LOG)
static int in_parse_row = 0 ;
# endif
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
Lexer * lexer = doc - > lexer ;
Node * node = NULL ;
Bool exclude_state = no ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
enum parserState {
STATE_INITIAL , /* This is the initial state for every parser. */
STATE_POST_NOT_ENDTAG , /* To-do after re-entering after !EndTag checks. */
STATE_POST_TD_TH , /* To-do after re-entering after TD/TH checks. */
STATE_COMPLETE , /* Done with the switch. */
} state = STATE_INITIAL ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( row = = NULL )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
TidyParserMemory memory = popMemory ( doc ) ;
row = memory . original_node ;
node = memory . reentry_node ; /* Throwaway, as main loop overrwrites anyway. */
state = memory . reentry_state ;
exclude_state = memory . register_b_1 ;
DEBUG_LOG ( SPRTF ( " >>>Re-Enter ParseRow with %s \n " , node - > element ) ) ;
}
else
{
DEBUG_LOG ( SPRTF ( " >>>Entering ParseRow %d... \n " , + + in_parse_row ) ) ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( row - > tag - > model & CM_EMPTY )
return NULL ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
while ( state ! = STATE_COMPLETE )
{
if ( state = = STATE_INITIAL )
node = TY_ ( GetToken ) ( doc , IgnoreWhitespace ) ;
switch ( state )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
case STATE_INITIAL :
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
if ( node = = NULL )
{
state = STATE_COMPLETE ;
continue ;
}
if ( node - > tag = = row - > tag )
{
if ( node - > type = = EndTag )
{
TY_ ( FreeNode ) ( doc , node ) ;
row - > closed = yes ;
FixEmptyRow ( doc , row ) ;
DEBUG_LOG ( SPRTF ( " <<<Exit ParseRow 1 %d... CM_EMPTY \n " , - - in_parse_row ) ) ;
return NULL ;
}
/* New row start implies end of current row */
TY_ ( UngetToken ) ( doc ) ;
FixEmptyRow ( doc , row ) ;
DEBUG_LOG ( SPRTF ( " <<<Exit ParseRow 2 %d... CM_EMPTY \n " , - - in_parse_row ) ) ;
return NULL ;
}
2021-07-28 23:45:57 +00:00
/*
2021-08-05 12:18:30 +00:00
if this is the end tag for an ancestor element
then infer end tag for this element
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
if ( node - > type = = EndTag )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
if ( ( TY_ ( nodeHasCM ) ( node , CM_HTML | CM_TABLE ) | | nodeIsTABLE ( node ) )
& & DescendantOf ( row , TagId ( node ) ) )
{
TY_ ( UngetToken ) ( doc ) ;
DEBUG_LOG ( SPRTF ( " <<<Exit ParseRow 3 %d... CM_EMPTY \n " , - - in_parse_row ) ) ;
return NULL ;
}
if ( nodeIsFORM ( node ) | | TY_ ( nodeHasCM ) ( node , CM_BLOCK | CM_INLINE ) )
{
if ( nodeIsFORM ( node ) )
BadForm ( doc ) ;
TY_ ( Report ) ( doc , row , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
if ( nodeIsTD ( node ) | | nodeIsTH ( node ) )
{
TY_ ( Report ) ( doc , row , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
2021-07-28 23:45:57 +00:00
}
2021-08-05 12:18:30 +00:00
/* deal with comments etc. */
if ( InsertMisc ( row , node ) )
continue ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/* discard unknown tags */
if ( node - > tag = = NULL & & node - > type ! = TextNode )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( Report ) ( doc , row , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/* discard unexpected <table> element */
if ( nodeIsTABLE ( node ) )
{
TY_ ( Report ) ( doc , row , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
2021-07-28 23:45:57 +00:00
}
2021-08-05 12:18:30 +00:00
/* THEAD, TFOOT or TBODY */
if ( TY_ ( nodeHasCM ) ( node , CM_ROWGRP ) )
2021-07-28 23:45:57 +00:00
{
TY_ ( UngetToken ) ( doc ) ;
2021-08-05 12:18:30 +00:00
DEBUG_LOG ( SPRTF ( " <<<Exit ParseRow 4 %d... CM_EMPTY \n " , - - in_parse_row ) ) ;
return NULL ;
2021-07-28 23:45:57 +00:00
}
2021-08-05 12:18:30 +00:00
if ( node - > type = = EndTag )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( Report ) ( doc , row , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
2021-07-28 23:45:57 +00:00
}
2021-08-05 12:18:30 +00:00
/*
if text or inline or block move before table
if head content move to head
*/
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( node - > type ! = EndTag )
{
if ( nodeIsFORM ( node ) )
{
TY_ ( UngetToken ) ( doc ) ;
node = TY_ ( InferredTag ) ( doc , TidyTag_TD ) ;
TY_ ( Report ) ( doc , row , node , MISSING_STARTTAG ) ;
}
else if ( TY_ ( nodeIsText ) ( node )
| | TY_ ( nodeHasCM ) ( node , CM_BLOCK | CM_INLINE ) )
{
MoveBeforeTable ( doc , row , node ) ;
TY_ ( Report ) ( doc , row , node , TAG_NOT_ALLOWED_IN ) ;
lexer - > exiled = yes ;
exclude_state = lexer - > excludeBlocks ;
lexer - > excludeBlocks = no ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( node - > type ! = TextNode )
{
TidyParserMemory memory = { 0 } ;
memory . identity = TY_ ( ParseRow ) ;
memory . original_node = row ;
memory . reentry_node = node ;
memory . reentry_state = STATE_POST_NOT_ENDTAG ;
memory . register_b_1 = exclude_state ;
pushMemory ( doc , memory ) ;
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseRow 1 with a node to parse: %s \n " , node - > element ) ) ;
return node ;
}
lexer - > exiled = no ;
lexer - > excludeBlocks = exclude_state ;
continue ;
}
else if ( node - > tag - > model & CM_HEAD )
{
TY_ ( Report ) ( doc , row , node , TAG_NOT_ALLOWED_IN ) ;
MoveToHead ( doc , row , node ) ;
continue ;
}
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( ! ( nodeIsTD ( node ) | | nodeIsTH ( node ) ) )
{
TY_ ( Report ) ( doc , row , node , TAG_NOT_ALLOWED_IN ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/* node should be <TD> or <TH> */
TY_ ( InsertNodeAtEnd ) ( row , node ) ;
exclude_state = lexer - > excludeBlocks ;
lexer - > excludeBlocks = no ;
{
TidyParserMemory memory = { 0 } ;
memory . identity = TY_ ( ParseRow ) ;
memory . original_node = row ;
memory . reentry_node = node ;
memory . reentry_state = STATE_POST_TD_TH ;
memory . register_b_1 = exclude_state ;
pushMemory ( doc , memory ) ;
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseRow 2 with a node to parse: %s \n " , node - > element ) ) ;
return node ;
}
} break ;
case STATE_POST_NOT_ENDTAG :
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
lexer - > exiled = no ;
lexer - > excludeBlocks = exclude_state ; /* capture this in stack. */
state = STATE_INITIAL ;
continue ;
} break ;
case STATE_POST_TD_TH :
{
lexer - > excludeBlocks = exclude_state ; /* capture this in stack. */
/* pop inline stack */
while ( lexer - > istacksize > lexer - > istackbase )
TY_ ( PopInline ) ( doc , NULL ) ;
state = STATE_INITIAL ;
continue ;
} break ;
default :
break ;
} /* switch */
} /* while */
DEBUG_LOG ( SPRTF ( " <<<Exit ParseRow at bottom %d... CM_EMPTY \n " , - - in_parse_row ) ) ;
2021-07-28 23:45:57 +00:00
return NULL ;
}
/** MARK: TY_(ParseRowGroup)
* Parses the ` rowgroup ` tag .
2021-08-05 12:18:30 +00:00
*
* This is a non - recursing parser . It uses the document ' s parser memory stack
* to send subsequent nodes back to the controller for dispatching to parsers .
* This parser is also re - enterable , so that post - processing can occur after
* such dispatching .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
Node * TY_ ( ParseRowGroup ) ( TidyDocImpl * doc , Node * rowgroup , GetTokenMode ARG_UNUSED ( mode ) )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
# if defined(ENABLE_DEBUG_LOG)
static int in_parse_rowgroup = 0 ;
# endif
Lexer * lexer = doc - > lexer ;
Node * node = NULL ;
Node * parent = NULL ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
enum parserState {
STATE_INITIAL , /* This is the initial state for every parser. */
STATE_POST_NOT_TEXTNODE , /* To-do after re-entering after checks. */
STATE_COMPLETE , /* Done with the switch. */
} state = STATE_INITIAL ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( rowgroup = = NULL )
{
TidyParserMemory memory = popMemory ( doc ) ;
rowgroup = memory . original_node ;
node = memory . reentry_node ; /* Throwaway, as main loop overrwrites anyway. */
state = memory . reentry_state ;
DEBUG_LOG ( SPRTF ( " >>>Re-Enter ParseRowGroup with %s \n " , node - > element ) ) ;
}
else
{
DEBUG_LOG ( SPRTF ( " >>>Entering ParseRowGroup %d... \n " , + + in_parse_rowgroup ) ) ;
if ( rowgroup - > tag - > model & CM_EMPTY )
{
DEBUG_LOG ( SPRTF ( " <<<Exit ParseRowGroup 1 %d \n " , - - in_parse_rowgroup ) ) ;
return NULL ;
}
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
while ( state ! = STATE_COMPLETE )
{
if ( state = = STATE_INITIAL )
node = TY_ ( GetToken ) ( doc , IgnoreWhitespace ) ;
switch ( state )
{
case STATE_INITIAL :
{
TidyParserMemory memory = { 0 } ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( node = = NULL )
{
state = STATE_COMPLETE ;
continue ;
}
if ( node - > tag = = rowgroup - > tag )
{
if ( node - > type = = EndTag )
{
rowgroup - > closed = yes ;
TY_ ( FreeNode ) ( doc , node ) ;
DEBUG_LOG ( SPRTF ( " <<<Exit ParseRowGroup 2 %d \n " , - - in_parse_rowgroup ) ) ;
return NULL ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
TY_ ( UngetToken ) ( doc ) ;
DEBUG_LOG ( SPRTF ( " <<<Exit ParseRowGroup 3 %d \n " , - - in_parse_rowgroup ) ) ;
return NULL ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/* if </table> infer end tag */
if ( nodeIsTABLE ( node ) & & node - > type = = EndTag )
{
TY_ ( UngetToken ) ( doc ) ;
DEBUG_LOG ( SPRTF ( " <<<Exit ParseRowGroup 4 %d \n " , - - in_parse_rowgroup ) ) ;
return NULL ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/* deal with comments etc. */
if ( InsertMisc ( rowgroup , node ) )
continue ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/* discard unknown tags */
if ( node - > tag = = NULL & & node - > type ! = TextNode )
{
TY_ ( Report ) ( doc , rowgroup , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/*
if TD or TH then infer < TR >
if text or inline or block move before table
if head content move to head
*/
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( node - > type ! = EndTag )
{
if ( nodeIsTD ( node ) | | nodeIsTH ( node ) )
{
TY_ ( UngetToken ) ( doc ) ;
node = TY_ ( InferredTag ) ( doc , TidyTag_TR ) ;
TY_ ( Report ) ( doc , rowgroup , node , MISSING_STARTTAG ) ;
}
else if ( TY_ ( nodeIsText ) ( node )
| | TY_ ( nodeHasCM ) ( node , CM_BLOCK | CM_INLINE ) )
{
MoveBeforeTable ( doc , rowgroup , node ) ;
TY_ ( Report ) ( doc , rowgroup , node , TAG_NOT_ALLOWED_IN ) ;
lexer - > exiled = yes ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( node - > type ! = TextNode )
{
memory . identity = TY_ ( ParseRowGroup ) ;
memory . original_node = rowgroup ;
memory . reentry_node = node ;
memory . reentry_state = STATE_POST_NOT_TEXTNODE ;
pushMemory ( doc , memory ) ;
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseRowGroup 1 with a node to parse: %s \n " , node - > element ) ) ;
return node ;
}
state = STATE_POST_NOT_TEXTNODE ;
continue ;
}
else if ( node - > tag - > model & CM_HEAD )
{
TY_ ( Report ) ( doc , rowgroup , node , TAG_NOT_ALLOWED_IN ) ;
MoveToHead ( doc , rowgroup , node ) ;
continue ;
}
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/*
if this is the end tag for ancestor element
then infer end tag for this element
*/
if ( node - > type = = EndTag )
{
if ( nodeIsFORM ( node ) | | TY_ ( nodeHasCM ) ( node , CM_BLOCK | CM_INLINE ) )
{
if ( nodeIsFORM ( node ) )
BadForm ( doc ) ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
TY_ ( Report ) ( doc , rowgroup , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( nodeIsTR ( node ) | | nodeIsTD ( node ) | | nodeIsTH ( node ) )
{
TY_ ( Report ) ( doc , rowgroup , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
for ( parent = rowgroup - > parent ;
parent ! = NULL ;
parent = parent - > parent )
{
if ( node - > tag = = parent - > tag )
{
TY_ ( UngetToken ) ( doc ) ;
DEBUG_LOG ( SPRTF ( " <<<Exit ParseRowGroup 5 %d \n " , - - in_parse_rowgroup ) ) ;
return NULL ;
}
}
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/*
if THEAD , TFOOT or TBODY then implied end tag
*/
if ( node - > tag - > model & CM_ROWGRP )
{
if ( node - > type ! = EndTag )
{
TY_ ( UngetToken ) ( doc ) ;
DEBUG_LOG ( SPRTF ( " <<<Exit ParseRowGroup 6 %d \n " , - - in_parse_rowgroup ) ) ;
return NULL ;
}
}
if ( node - > type = = EndTag )
{
TY_ ( Report ) ( doc , rowgroup , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
if ( ! nodeIsTR ( node ) )
{
node = TY_ ( InferredTag ) ( doc , TidyTag_TR ) ;
TY_ ( Report ) ( doc , rowgroup , node , MISSING_STARTTAG ) ;
TY_ ( UngetToken ) ( doc ) ;
}
/* node should be <TR> */
TY_ ( InsertNodeAtEnd ) ( rowgroup , node ) ;
memory . identity = TY_ ( ParseRowGroup ) ;
memory . original_node = rowgroup ;
memory . reentry_node = node ;
memory . reentry_state = STATE_INITIAL ;
pushMemory ( doc , memory ) ;
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseRowGroup 2 with a node to parse: %s \n " , node - > element ) ) ;
return node ;
} break ;
case STATE_POST_NOT_TEXTNODE :
{
lexer - > exiled = no ;
state = STATE_INITIAL ;
continue ;
} break ;
default :
break ;
} /* switch */
} /* while */
DEBUG_LOG ( SPRTF ( " <<<Exit ParseRowGroup at bottom %d \n " , - - in_parse_rowgroup ) ) ;
2021-07-28 23:45:57 +00:00
return NULL ;
}
/** MARK: TY_(ParseScript)
* Parses the ` script ` tag .
*
* @ todo This isn ' t quite right for CDATA content as it recognises tags
* within the content and parses them accordingly . This will unfortunately
* screw up scripts which include :
* < + letter
* < + !
* < + ?
* < + / + letter
2021-08-05 12:18:30 +00:00
*
* This is a non - recursing parser . It uses the document ' s parser memory stack
* to send subsequent nodes back to the controller for dispatching to parsers .
* This parser is also re - enterable , so that post - processing can occur after
* such dispatching .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
Node * TY_ ( ParseScript ) ( TidyDocImpl * doc , Node * script , GetTokenMode ARG_UNUSED ( mode ) )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
Node * node = NULL ;
doc - > lexer - > parent = script ;
node = TY_ ( GetToken ) ( doc , CdataContent ) ;
doc - > lexer - > parent = NULL ;
if ( node )
{
TY_ ( InsertNodeAtEnd ) ( script , node ) ;
}
else
{
/* handle e.g. a document like "<script>" */
TY_ ( Report ) ( doc , script , NULL , MISSING_ENDTAG_FOR ) ;
return NULL ;
}
node = TY_ ( GetToken ) ( doc , IgnoreWhitespace ) ;
if ( ! ( node & & node - > type = = EndTag & & node - > tag & &
node - > tag - > id = = script - > tag - > id ) )
{
TY_ ( Report ) ( doc , script , node , MISSING_ENDTAG_FOR ) ;
if ( node )
TY_ ( UngetToken ) ( doc ) ;
}
else
{
TY_ ( FreeNode ) ( doc , node ) ;
}
2021-07-28 23:45:57 +00:00
return NULL ;
}
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseSelect)
 *  Parses the content of a `select` element.
 *
 *  This is a non-recursing parser. Each `option`, `optgroup`, `datalist` or
 *  `script` start tag is appended to the element, the position is saved
 *  with pushMemory(), and the child is returned so the controller can
 *  dispatch it. The function is re-entered with `field == NULL`, at which
 *  point the element is recovered with popMemory().
 *
 *  @param doc    The Tidy document being parsed.
 *  @param field  The `select` node to fill, or NULL when re-entering.
 *  @param mode   Unused; tokens are always fetched with IgnoreWhitespace.
 *  @returns The child node to parse next, or NULL when the end tag was seen
 *           or the token stream ran out.
 */
Node* TY_(ParseSelect)( TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode) )
{
#if defined(ENABLE_DEBUG_LOG)
    static int in_parse_select = 0;
#endif
    Lexer* lexer = doc->lexer;
    Node *node;

    if ( field != NULL )
    {
        DEBUG_LOG(SPRTF(" >>>Entering ParseSelect %d... \n ", ++in_parse_select));
    }
    else
    {
        /* Re-entry: only the select node is needed again; the saved child
           is used for logging and then superseded by the next token. */
        TidyParserMemory saved = popMemory( doc );
        field = saved.original_node;
        node = saved.reentry_node;
        DEBUG_LOG(SPRTF(" >>>Re-Enter ParseSelect with %s \n ", node->element));
    }

    lexer->insert = NULL;  /* defer implicit inline start tags */

    for (;;)
    {
        node = TY_(GetToken)( doc, IgnoreWhitespace );
        if ( node == NULL )
            break;

        /* </select> closes the element. */
        if ( node->type == EndTag && node->tag == field->tag )
        {
            TY_(FreeNode)( doc, node );
            field->closed = yes;
            TrimSpaces( doc, field );
            DEBUG_LOG(SPRTF(" <<<Exit ParseSelect 1 %d... \n ", --in_parse_select));
            return NULL;
        }

        /* deal with comments etc. */
        if ( InsertMisc(field, node) )
            continue;

        /* Only these four start tags are accepted as children. */
        if ( node->type != StartTag ||
             !( nodeIsOPTION(node)   ||
                nodeIsOPTGROUP(node) ||
                nodeIsDATALIST(node) ||
                nodeIsSCRIPT(node) ) )
        {
            /* discard unexpected tags */
            TY_(Report)( doc, field, node, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, node );
            continue;
        }

        {
            TidyParserMemory saved = {0};
            saved.identity = TY_(ParseSelect);
            saved.original_node = field;
            saved.reentry_node = node;
            TY_(InsertNodeAtEnd)( field, node );
            pushMemory( doc, saved );
            DEBUG_LOG(SPRTF(" <<<Exiting ParseSelect with a node to parse: %s \n ", node->element));
            return node;
        }
    }

    /* Token stream ended without </select>; node is NULL here. */
    TY_(Report)( doc, field, node, MISSING_ENDTAG_FOR );
    DEBUG_LOG(SPRTF(" <<<Exit ParseSelect 2 %d... \n ", --in_parse_select));
    return NULL;
}
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseTableTag)
* Parses the ` table ` tag .
*
* This is a non - recursing parser . It uses the document ' s parser memory stack
* to send subsequent nodes back to the controller for dispatching to parsers .
* This parser is also re - enterable , so that post - processing can occur after
* such dispatching .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
Node * TY_ ( ParseTableTag ) ( TidyDocImpl * doc , Node * table , GetTokenMode ARG_UNUSED ( mode ) )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
# if defined(ENABLE_DEBUG_LOG)
static int in_parse_table = 0 ;
# endif
2021-07-28 23:45:57 +00:00
Lexer * lexer = doc - > lexer ;
2021-08-05 12:18:30 +00:00
Node * node , * parent ;
uint istackbase ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( table = = NULL )
2021-07-28 23:45:57 +00:00
{
TidyParserMemory memory = popMemory ( doc ) ;
2021-08-05 12:18:30 +00:00
node = memory . reentry_node ; /* Throwaway, as main loop overrwrites anyway. */
table = memory . original_node ;
lexer - > exiled = memory . register_b_1 ;
DEBUG_LOG ( SPRTF ( " >>>Re-Enter ParseTableTag with %s \n " , node - > element ) ) ;
2021-07-28 23:45:57 +00:00
}
2021-08-05 12:18:30 +00:00
else
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
DEBUG_LOG ( SPRTF ( " >>>Entering ParseTableTag %d... \n " , + + in_parse_table ) ) ;
TY_ ( DeferDup ) ( doc ) ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
istackbase = lexer - > istackbase ;
lexer - > istackbase = lexer - > istacksize ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
while ( ( node = TY_ ( GetToken ) ( doc , IgnoreWhitespace ) ) ! = NULL )
{
DEBUG_LOG ( SPRTF ( " ---ParseTableTag got token %s with mode %u \n " , node - > element , IgnoreWhitespace ) ) ;
if ( node - > tag = = table - > tag )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
if ( node - > type = = EndTag )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( FreeNode ) ( doc , node ) ;
}
else
{
/* Issue #498 - If a <table> in a <table>
* just close the current table , and issue a
* warning . The previous action was to discard
* this second < table >
*/
TY_ ( UngetToken ) ( doc ) ;
TY_ ( Report ) ( doc , table , node , TAG_NOT_ALLOWED_IN ) ;
}
lexer - > istackbase = istackbase ;
table - > closed = yes ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
DEBUG_LOG ( SPRTF ( " <<<Exit ParseTableTag 1 %d... EndTag \n " , - - in_parse_table ) ) ;
return NULL ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/* deal with comments etc. */
if ( InsertMisc ( table , node ) )
continue ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/* discard unknown tags */
if ( node - > tag = = NULL & & node - > type ! = TextNode )
{
TY_ ( Report ) ( doc , table , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/* if TD or TH or text or inline or block then infer <TR> */
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( node - > type ! = EndTag )
{
if ( nodeIsTD ( node ) | | nodeIsTH ( node ) | | nodeIsTABLE ( node ) )
{
TY_ ( UngetToken ) ( doc ) ;
node = TY_ ( InferredTag ) ( doc , TidyTag_TR ) ;
TY_ ( Report ) ( doc , table , node , MISSING_STARTTAG ) ;
}
else if ( TY_ ( nodeIsText ) ( node ) | | TY_ ( nodeHasCM ) ( node , CM_BLOCK | CM_INLINE ) )
{
TY_ ( InsertNodeBeforeElement ) ( table , node ) ;
TY_ ( Report ) ( doc , table , node , TAG_NOT_ALLOWED_IN ) ;
lexer - > exiled = yes ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( node - > type ! = TextNode )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
TidyParserMemory memory = { 0 } ;
memory . identity = TY_ ( ParseTableTag ) ;
memory . original_node = table ;
memory . reentry_node = node ;
memory . register_b_1 = no ; /* later, lexer->exiled = no */
memory . mode = IgnoreWhitespace ;
pushMemory ( doc , memory ) ;
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseTableTag with a node to parse: %s \n " , node - > element ) ) ;
return node ;
2021-07-28 23:45:57 +00:00
}
2021-08-05 12:18:30 +00:00
lexer - > exiled = no ;
continue ;
}
else if ( node - > tag - > model & CM_HEAD )
{
MoveToHead ( doc , table , node ) ;
continue ;
}
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/*
if this is the end tag for an ancestor element
then infer end tag for this element
*/
if ( node - > type = = EndTag )
{
if ( nodeIsFORM ( node ) )
{
BadForm ( doc ) ;
TY_ ( Report ) ( doc , table , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/* best to discard unexpected block/inline end tags */
if ( TY_ ( nodeHasCM ) ( node , CM_TABLE | CM_ROW ) | |
TY_ ( nodeHasCM ) ( node , CM_BLOCK | CM_INLINE ) )
{
TY_ ( Report ) ( doc , table , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
for ( parent = table - > parent ;
parent ! = NULL ;
parent = parent - > parent )
{
if ( node - > tag = = parent - > tag )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
TY_ ( Report ) ( doc , table , node , MISSING_ENDTAG_BEFORE ) ;
TY_ ( UngetToken ) ( doc ) ;
lexer - > istackbase = istackbase ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
DEBUG_LOG ( SPRTF ( " <<<Exit ParseTableTag 2 %d... missing EndTag \n " , - - in_parse_table ) ) ;
return NULL ;
2021-07-28 23:45:57 +00:00
}
2021-08-05 12:18:30 +00:00
}
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( ! ( node - > tag - > model & CM_TABLE ) )
{
TY_ ( UngetToken ) ( doc ) ;
TY_ ( Report ) ( doc , table , node , TAG_NOT_ALLOWED_IN ) ;
lexer - > istackbase = istackbase ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
DEBUG_LOG ( SPRTF ( " <<<Exit ParseTableTag 3 %d... CM_TABLE \n " , - - in_parse_table ) ) ;
return NULL ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
if ( TY_ ( nodeIsElement ) ( node ) )
{
TidyParserMemory memory = { 0 } ;
TY_ ( InsertNodeAtEnd ) ( table , node ) ;
memory . identity = TY_ ( ParseTableTag ) ;
memory . original_node = table ;
memory . reentry_node = node ;
memory . register_b_1 = lexer - > exiled ;
pushMemory ( doc , memory ) ;
DEBUG_LOG ( SPRTF ( " <<<Exiting ParseTableTag with a node to parse: %s \n " , node - > element ) ) ;
return node ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/* discard unexpected text nodes and end tags */
TY_ ( Report ) ( doc , table , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
TY_ ( Report ) ( doc , table , node , MISSING_ENDTAG_FOR ) ;
lexer - > istackbase = istackbase ;
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
DEBUG_LOG ( SPRTF ( " <<<Exit ParseTableTag 4 %d... missing end \n " , - - in_parse_table ) ) ;
return NULL ;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseText)
 *  Parses the content of text-only form fields (`option`, `textarea`).
 *
 *  Text tokens are appended to `field`; inline tags that are not themselves
 *  form fields are discarded with a warning; any other token ends the
 *  element and is pushed back to the lexer.
 *
 *  This parser does not dispatch child nodes to other parsers: it always
 *  returns NULL.
 *
 *  @param doc   The document being parsed.
 *  @param field The element whose content is to be read.
 *  @param mode  Ignored on input: the value is replaced immediately with
 *               Preformatted for `textarea`, and MixedContent otherwise.
 *  @returns     Always NULL.
 */
Node* TY_(ParseText)( TidyDocImpl* doc, Node *field, GetTokenMode mode )
{
    Lexer *lexer = doc->lexer;
    Node  *node;

    lexer->insert = NULL;  /* defer implicit inline start tags */

    if ( nodeIsTEXTAREA(field) )
        mode = Preformatted;
    else
        mode = MixedContent;  /* kludge for font tags */

    while ( (node = TY_(GetToken)( doc, mode )) != NULL )
    {
        if ( node->tag == field->tag && node->type == EndTag )
        {
            TY_(FreeNode)( doc, node );
            field->closed = yes;
            TrimSpaces( doc, field );
            return NULL;
        }

        /* deal with comments etc. */
        if ( InsertMisc( field, node ) )
            continue;

        if ( TY_(nodeIsText)(node) )
        {
            /* only called for 1st child */
            /* The token mode is compared as a discrete value, not tested
             * as a bit flag, matching how ParseXMLElement tests it. */
            if ( field->content == NULL && mode != Preformatted )
                TrimSpaces( doc, field );

            /* Drop text nodes that are empty. */
            if ( node->start >= node->end )
            {
                TY_(FreeNode)( doc, node );
                continue;
            }

            TY_(InsertNodeAtEnd)( field, node );
            continue;
        }

        /* for textarea should all cases of < and & be escaped? */

        /* discard inline tags e.g. font */
        if (   node->tag
            && node->tag->model & CM_INLINE
            && !(node->tag->model & CM_FIELD) ) /* #487283 - fix by Lee Passey 25 Jan 02 */
        {
            TY_(Report)( doc, field, node, DISCARDING_UNEXPECTED );
            TY_(FreeNode)( doc, node );
            continue;
        }

        /* terminate element on other tags */
        if ( !(field->tag->model & CM_OPT) )
            TY_(Report)( doc, field, node, MISSING_ENDTAG_BEFORE );

        TY_(UngetToken)( doc );
        TrimSpaces( doc, field );
        return NULL;
    }

    /* Out of tokens (node is NULL here); only warn where the end tag is
     * not optional. */
    if ( !(field->tag->model & CM_OPT) )
        TY_(Report)( doc, field, node, MISSING_ENDTAG_FOR );

    return NULL;
}
2021-08-05 12:18:30 +00:00
/** MARK: TY_(ParseTitle)
* Parses the ` title ` tag .
*
* This is a non - recursing parser . It uses the document ' s parser memory stack
* to send subsequent nodes back to the controller for dispatching to parsers .
* This parser is also re - enterable , so that post - processing can occur after
* such dispatching .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
Node * TY_ ( ParseTitle ) ( TidyDocImpl * doc , Node * title , GetTokenMode ARG_UNUSED ( mode ) )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
Node * node ;
while ( ( node = TY_ ( GetToken ) ( doc , MixedContent ) ) ! = NULL )
{
if ( node - > tag = = title - > tag & & node - > type = = StartTag
& & cfgBool ( doc , TidyCoerceEndTags ) )
{
TY_ ( Report ) ( doc , title , node , COERCE_TO_ENDTAG ) ;
node - > type = EndTag ;
TY_ ( UngetToken ) ( doc ) ;
continue ;
}
else if ( node - > tag = = title - > tag & & node - > type = = EndTag )
{
TY_ ( FreeNode ) ( doc , node ) ;
title - > closed = yes ;
TrimSpaces ( doc , title ) ;
return NULL ;
}
if ( TY_ ( nodeIsText ) ( node ) )
{
/* only called for 1st child */
if ( title - > content = = NULL )
TrimInitialSpace ( doc , title , node ) ;
if ( node - > start > = node - > end )
{
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
TY_ ( InsertNodeAtEnd ) ( title , node ) ;
continue ;
}
/* deal with comments etc. */
if ( InsertMisc ( title , node ) )
continue ;
/* discard unknown tags */
if ( node - > tag = = NULL )
{
TY_ ( Report ) ( doc , title , node , DISCARDING_UNEXPECTED ) ;
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
/* pushback unexpected tokens */
TY_ ( Report ) ( doc , title , node , MISSING_ENDTAG_BEFORE ) ;
TY_ ( UngetToken ) ( doc ) ;
TrimSpaces ( doc , title ) ;
return NULL ;
}
TY_ ( Report ) ( doc , title , node , MISSING_ENDTAG_FOR ) ;
2021-07-28 23:45:57 +00:00
return NULL ;
}
/***************************************************************************/ /*
* * MARK : - Post - Parse Operations
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/**
2021-08-05 12:18:30 +00:00
* Performs checking of all attributes recursively starting at ` node ` .
2021-07-28 23:45:57 +00:00
*/
2021-08-05 12:18:30 +00:00
static void AttributeChecks ( TidyDocImpl * doc , Node * node )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
Node * next ;
2021-07-28 23:45:57 +00:00
while ( node )
{
2021-08-05 12:18:30 +00:00
next = node - > next ;
if ( TY_ ( nodeIsElement ) ( node ) )
2021-07-28 23:45:57 +00:00
{
2021-08-05 12:18:30 +00:00
if ( node - > tag & & node - > tag - > chkattrs ) /* [i_a]2 fix crash after adding SVG support with alt/unknown tag subtree insertion there */
node - > tag - > chkattrs ( doc , node ) ;
else
TY_ ( CheckAttributes ) ( doc , node ) ;
2021-07-28 23:45:57 +00:00
}
2021-08-05 12:18:30 +00:00
if ( node - > content )
AttributeChecks ( doc , node - > content ) ;
assert ( next ! = node ) ; /* http://tidy.sf.net/issue/1603538 */
node = next ;
2021-07-28 23:45:57 +00:00
}
}
/**
 *  Wraps naked text found directly inside `form`, `noscript` and
 *  `blockquote` elements in inferred `p` elements.
 *
 *  <form>, <blockquote>, and <noscript> do not allow #PCDATA in
 *  HTML 4.01 Strict (%block; model instead of %flow;).
 *
 *  Walks `node` and its following siblings, recursing into children first.
 */
static void EncloseBlockText( TidyDocImpl* doc, Node* node )
{
    while ( node != NULL )
    {
        Node *following = node->next;
        Node *child;

        if ( node->content != NULL )
            EncloseBlockText( doc, node->content );

        /* Only non-empty form / noscript / blockquote are of interest. */
        if ( node->content != NULL &&
             ( nodeIsFORM(node) || nodeIsNOSCRIPT(node) || nodeIsBLOCKQUOTE(node) ) )
        {
            child = node->content;

            if ( ( TY_(nodeIsText)(child) && !TY_(IsBlank)(doc->lexer, child) ) ||
                 ( TY_(nodeIsElement)(child) && nodeCMIsOnlyInline(child) ) )
            {
                Node *para = TY_(InferredTag)( doc, TidyTag_P );
                TY_(InsertNodeBeforeElement)( child, para );

                /* Move the leading run of non-element / inline-only nodes
                 * into the new paragraph. */
                while ( child != NULL &&
                        ( !TY_(nodeIsElement)(child) || nodeCMIsOnlyInline(child) ) )
                {
                    Node *after = child->next;
                    TY_(RemoveNode)( child );
                    TY_(InsertNodeAtEnd)( para, child );
                    child = after;
                }

                TrimSpaces( doc, para );

                /* Look at the same node again rather than advancing. */
                continue;
            }
        }

        node = following;
    }
}
2021-08-05 12:18:30 +00:00
/**
 *  Wraps naked text found directly inside `body` in inferred `p` elements.
 *
 *  Does nothing when the document has no body.
 */
static void EncloseBodyText( TidyDocImpl* doc )
{
    Node *body = TY_(FindBody)( doc );
    Node *cursor;

    if ( body == NULL )
        return;

    cursor = body->content;
    while ( cursor != NULL )
    {
        Bool nakedText  = ( TY_(nodeIsText)(cursor) && !TY_(IsBlank)(doc->lexer, cursor) ) ? yes : no;
        Bool inlineOnly = ( !nakedText && TY_(nodeIsElement)(cursor) && nodeCMIsOnlyInline(cursor) ) ? yes : no;

        if ( !nakedText && !inlineOnly )
        {
            cursor = cursor->next;
            continue;
        }

        {
            Node *para = TY_(InferredTag)( doc, TidyTag_P );
            TY_(InsertNodeBeforeElement)( cursor, para );

            /* Move the run of non-element / inline-only nodes into the
             * paragraph; cursor ends on the first node left outside it. */
            while ( cursor != NULL &&
                    ( !TY_(nodeIsElement)(cursor) || nodeCMIsOnlyInline(cursor) ) )
            {
                Node *after = cursor->next;
                TY_(RemoveNode)( cursor );
                TY_(InsertNodeAtEnd)( para, cursor );
                cursor = after;
            }

            TrimSpaces( doc, para );
        }
    }
}
2021-07-28 23:45:57 +00:00
/**
 *  Replaces obsolete elements with substitute tags: `dir` becomes `ul`;
 *  `xmp`, `listing` and `plaintext` become `pre`.
 *
 *  Walks `node` and its following siblings, recursing into children.
 */
static void ReplaceObsoleteElements( TidyDocImpl* doc, Node* node )
{
    Node *following;

    for ( ; node != NULL; node = following )
    {
        following = node->next;

        /* if (nodeIsDIR(node) || nodeIsMENU(node)) */
        /* HTML5 - <menu ... > is no longer obsolete */
        if ( nodeIsDIR(node) )
        {
            TY_(CoerceNode)( doc, node, TidyTag_UL, yes, yes );
        }

        if ( nodeIsXMP(node) || nodeIsLISTING(node) ||
             ( node->tag != NULL && node->tag->id == TidyTag_PLAINTEXT ) )
        {
            TY_(CoerceNode)( doc, node, TidyTag_PRE, yes, yes );
        }

        if ( node->content != NULL )
            ReplaceObsoleteElements( doc, node->content );
    }
}
/***************************************************************************/ /*
* * MARK : - Internal API Implementation
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/** MARK: TY_(CheckNodeIntegrity)
 *  Verifies that the links of `node` and of its whole subtree are mutually
 *  consistent: prev/next point back at each other, the first and last
 *  children agree with the parent's `content` and `last`, and every child
 *  names this node as its parent.
 *
 *  @note The check compiles to a plain `return yes` when the macro
 *        NO_NODE_INTEGRITY_CHECK is defined.
 *
 *  @param node Root of the subtree to check.
 *  @returns    `yes` if consistent (or the check is disabled), else `no`.
 */
Bool TY_(CheckNodeIntegrity)( Node *node )
{
#ifndef NO_NODE_INTEGRITY_CHECK
    Node *kid;

    /* Sibling links must be reciprocal, and a node may not follow itself. */
    if ( node->prev != NULL && node->prev->next != node )
        return no;

    if ( node->next != NULL &&
         ( node->next == node || node->next->prev != node ) )
        return no;

    /* A node at either end of the sibling list must be known to its parent. */
    if ( node->parent != NULL )
    {
        if ( node->prev == NULL && node->parent->content != node )
            return no;

        if ( node->next == NULL && node->parent->last != node )
            return no;
    }

    for ( kid = node->content; kid != NULL; kid = kid->next )
    {
        if ( kid->parent != node )
            return no;
        if ( !TY_(CheckNodeIntegrity)( kid ) )
            return no;
    }
#endif
    return yes;
}
/** MARK: TY_(CoerceNode)
 *  Turns `node` into a different element in place, for example from a <p>
 *  to a <br>.
 *
 *  A temporary inferred node of the new tag is created only so that it can
 *  be named in the report, and is released again immediately afterwards.
 *  The node's previous tag is kept in `node->was`.
 *
 *  @param doc        The document owning the node.
 *  @param node       The node to coerce.
 *  @param tid        Identifier of the tag to coerce to.
 *  @param obsolete   If true, report OBSOLETE_ELEMENT.
 *  @param unexpected Otherwise, if true, report REPLACING_UNEX_ELEMENT;
 *                    if both are false, report REPLACING_ELEMENT.
 */
void TY_(CoerceNode)( TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected )
{
    const Dict *replacement = TY_(LookupTagDef)( tid );
    Node *sample = TY_(InferredTag)( doc, replacement->id );

    if ( obsolete )
    {
        TY_(Report)( doc, node, sample, OBSOLETE_ELEMENT );
    }
    else if ( unexpected )
    {
        TY_(Report)( doc, node, sample, REPLACING_UNEX_ELEMENT );
    }
    else
    {
        TY_(Report)( doc, node, sample, REPLACING_ELEMENT );
    }

    /* The sample node existed only for the message. */
    TidyDocFree( doc, sample->element );
    TidyDocFree( doc, sample );

    /* Re-label the node, remembering what it used to be. */
    node->was      = node->tag;
    node->tag      = replacement;
    node->type     = StartTag;
    node->implicit = yes;

    TidyDocFree( doc, node->element );
    node->element = TY_(tmbstrdup)( doc->allocator, replacement->name );
}
/** MARK: TY_(DiscardElement)
 *  Unlinks `element` from the markup tree and frees it.
 *
 *  @param doc     The document owning the node.
 *  @param element The node to discard; may be NULL.
 *  @returns       The node that followed `element` before removal, or NULL
 *                 if there was none or `element` was NULL.
 */
Node *TY_(DiscardElement)( TidyDocImpl* doc, Node *element )
{
    Node *successor;

    if ( element == NULL )
        return NULL;

    /* Must be read before RemoveNode clears the sibling links. */
    successor = element->next;

    TY_(RemoveNode)( element );
    TY_(FreeNode)( doc, element );

    return successor;
}
2011-11-17 02:44:16 +00:00
2021-08-05 12:18:30 +00:00
/** MARK: TY_(DropEmptyElements)
 *  Trims empty elements from a tree. Starting at `node`, each sibling has
 *  its children processed first; elements and zero-length text nodes are
 *  then offered to TY_(TrimEmptyElement).
 *
 *  @param doc  The document owning the nodes.
 *  @param node First node of the sibling list to process.
 *  @returns    The loop cursor once the sibling list is exhausted; as the
 *              loop only ends when the cursor is NULL, this is NULL.
 */
Node* TY_(DropEmptyElements)( TidyDocImpl* doc, Node* node )
{
    while ( node != NULL )
    {
        Node *following = node->next;
        Bool  candidate;

        if ( node->content != NULL )
            TY_(DropEmptyElements)( doc, node->content );

        /* Only elements and empty text nodes can be trimmed. */
        candidate = ( TY_(nodeIsElement)(node) ||
                      ( TY_(nodeIsText)(node) && node->start >= node->end ) ) ? yes : no;

        if ( candidate )
            node = TY_(TrimEmptyElement)( doc, node );
        else
            node = following;
    }

    return node;
}
2021-07-28 23:45:57 +00:00
/** MARK: TY_(InsertNodeAtStart)
 *  Inserts `node` into the markup tree as the first child of `element`.
 *
 *  @param element The parent to insert into.
 *  @param node    The node to insert.
 */
void TY_(InsertNodeAtStart)( Node *element, Node *node )
{
    Node *formerFirst = element->content;

    node->parent = element;

    /* With no children yet, the new node is also the last child. */
    if ( formerFirst != NULL )
        formerFirst->prev = node;
    else
        element->last = node;

    node->prev = NULL;
    node->next = formerFirst;
    element->content = node;
}
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/** MARK: TY_(InsertNodeAtEnd)
 *  Inserts `node` into the markup tree as the last child of `element`.
 *
 *  A NULL `element` is tolerated: the node's `parent` and `prev` are then
 *  set to NULL and nothing else is touched.
 *
 *  @param element The parent to append to; may be NULL.
 *  @param node    The node to append.
 */
void TY_(InsertNodeAtEnd)( Node *element, Node *node )
{
    node->parent = element;

    if ( element == NULL )
    {
        node->prev = NULL;
        return;
    }

    node->prev = element->last;

    /* Either chain after the current last child, or become the first. */
    if ( element->last != NULL )
        element->last->next = node;
    else
        element->content = node;

    element->last = node;
}
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/** MARK: TY_(InsertNodeBeforeElement)
 *  Inserts `node` into the markup tree immediately before `element`.
 *
 *  A NULL `element` is tolerated: the node's `parent`, `next` and `prev`
 *  are then all set to NULL.
 *
 *  @param element The node to insert in front of; may be NULL.
 *  @param node    The node to insert.
 */
void TY_(InsertNodeBeforeElement)( Node *element, Node *node )
{
    Node *owner;

    if ( element == NULL )
    {
        node->parent = NULL;
        node->next = NULL;
        node->prev = NULL;
        return;
    }

    owner = element->parent;

    node->parent = owner;
    node->next = element;
    node->prev = element->prev;
    element->prev = node;

    if ( node->prev != NULL )
        node->prev->next = node;

    /* If element was the first child, node takes over that role. */
    if ( owner != NULL && owner->content == element )
        owner->content = node;
}
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/** MARK: TY_(InsertNodeAfterElement)
 *  Inserts `node` into the markup tree immediately after `element`.
 *
 *  @param element The node to insert behind; must not be NULL.
 *  @param node    The node to insert.
 */
void TY_(InsertNodeAfterElement)( Node *element, Node *node )
{
    Node *parent;

    parent = element->parent;
    node->parent = parent;

    /* AQ - 13 Jan 2000 fix for parent == NULL */
    if ( parent != NULL && parent->last == element )
    {
        parent->last = node;
        /* node is now the last sibling, so its forward link must end the
         * list. Without this, a `next` value left over from wherever node
         * was before would remain in place. */
        node->next = NULL;
    }
    else
    {
        node->next = element->next;
        /* AQ - 13 Jan 2000 fix for node->next == NULL */
        if ( node->next != NULL )
            node->next->prev = node;
    }

    element->next = node;
    node->prev = element;
}
2021-07-28 23:45:57 +00:00
/** MARK: TY_(IsBlank)
 *  Indicates whether `node` is a blank text node, meaning a text node that
 *  is either zero-length or consists of exactly one space character.
 *
 *  @param lexer The lexer whose buffer holds the node's text.
 *  @param node  The node to test.
 *  @returns     `yes` if blank; `no` otherwise, including for non-text nodes.
 */
Bool TY_(IsBlank)( Lexer *lexer, Node *node )
{
    if ( !TY_(nodeIsText)(node) )
        return no;

    /* Zero length */
    if ( node->end == node->start )
        return yes;

    /* or one blank. */
    if ( node->end == node->start + 1 && lexer->lexbuf[node->start] == ' ' )
        return yes;

    return no;
}
2021-07-28 23:45:57 +00:00
/** MARK: TY_(IsJavaScript)
 *  Indicates whether a node is declared as containing javascript code.
 *
 *  A node with no attributes at all counts as javascript. Otherwise the
 *  node qualifies if any `language` or `type` attribute value contains
 *  "javascript".
 *
 *  @param node The node to test.
 *  @returns    `yes` or `no` as described above.
 */
Bool TY_(IsJavaScript)( Node *node )
{
    AttVal *candidate;

    if ( node->attributes == NULL )
        return yes;

    for ( candidate = node->attributes; candidate != NULL; candidate = candidate->next )
    {
        if ( !( attrIsLANGUAGE(candidate) || attrIsTYPE(candidate) ) )
            continue;

        if ( AttrContains( candidate, "javascript" ) )
            return yes;
    }

    return no;
}
2021-07-28 23:45:57 +00:00
2021-08-05 12:18:30 +00:00
/** MARK: TY_(IsNewNode)
 *  Used to check if a node uses CM_NEW, which determines how attributes
 *  without values should be printed. This was introduced to deal with
 *  user-defined tags e.g. ColdFusion.
 *
 *  @param node The node to test; may be NULL.
 *  @returns    `yes` if the node's tag has the CM_NEW model bit set, or if
 *              the node is NULL or has no tag; `no` otherwise.
 */
Bool TY_(IsNewNode)( Node *node )
{
    if ( node && node->tag )
    {
        /* Collapse the masked bit to a proper Bool rather than returning
         * the raw mask value through a Bool return type. */
        return ( node->tag->model & CM_NEW ) ? yes : no;
    }
    return yes;
}
/** MARK: TY_(RemoveNode)
 *  Extracts a node, together with its children, from the markup tree.
 *
 *  The neighbours and the parent are re-linked around the node, and the
 *  node's own `parent`, `prev` and `next` are cleared. The node's children
 *  are left attached to it.
 *
 *  @param node The node to extract.
 *  @returns    `node`.
 */
Node *TY_(RemoveNode)( Node *node )
{
    Node *before = node->prev;
    Node *after  = node->next;
    Node *owner  = node->parent;

    if ( before != NULL )
        before->next = after;

    if ( after != NULL )
        after->prev = before;

    if ( owner != NULL )
    {
        if ( owner->content == node )
            owner->content = after;

        if ( owner->last == node )
            owner->last = before;
    }

    node->next = NULL;
    node->prev = NULL;
    node->parent = NULL;

    return node;
}
/** MARK: TY_(TrimEmptyElement)
 *  Discards a single element if CanPrune() allows it. For anything other
 *  than a text node, the removal is reported and the matching footnote
 *  flag is set on the document.
 *
 *  @param doc     The document owning the node.
 *  @param element The node to consider.
 *  @returns       The node following `element`, whether or not it was
 *                 discarded.
 */
Node *TY_(TrimEmptyElement)( TidyDocImpl* doc, Node *element )
{
    if ( !CanPrune( doc, element ) )
        return element->next;

    if ( element->type != TextNode )
    {
        doc->footnotes |= FN_TRIM_EMPTY_ELEMENT;
        TY_(Report)( doc, element, NULL, TRIM_EMPTY_ELEMENT );
    }

    return TY_(DiscardElement)( doc, element );
}
/** MARK: TY_(XMLPreserveWhiteSpace)
 *  Indicates whether whitespace is to be preserved inside `element` in
 *  XHTML/XML documents.
 *
 *  The first `xml:space` attribute found decides the matter. Without one,
 *  whitespace is preserved for `pre`, `script`, `style`, for elements whose
 *  parser is TY_(ParsePre), and for `xsl:text`.
 *
 *  @param doc     The document owning the element.
 *  @param element The element to test.
 *  @returns       `yes` to preserve whitespace, `no` otherwise.
 */
Bool TY_(XMLPreserveWhiteSpace)( TidyDocImpl* doc, Node *element )
{
    AttVal *attr;

    /* search attributes for xml:space */
    for ( attr = element->attributes; attr != NULL; attr = attr->next )
    {
        if ( !attrIsXML_SPACE(attr) )
            continue;

        /* The first xml:space attribute is authoritative. */
        return AttrValueIs( attr, "preserve" ) ? yes : no;
    }

    if ( element->element == NULL )
        return no;

    /* kludge for html docs without explicit xml:space attribute */
    if ( nodeIsPRE(element)    ||
         nodeIsSCRIPT(element) ||
         nodeIsSTYLE(element)  ||
         TY_(FindParser)( doc, element ) == TY_(ParsePre) )
    {
        return yes;
    }

    /* kludge for XSL docs */
    return ( TY_(tmbstrcasecmp)( element->element, "xsl:text" ) == 0 ) ? yes : no;
}
/***************************************************************************/ /*
* * MARK : - Internal API Implementation - Main Parsers
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2021-07-28 23:45:57 +00:00
/** MARK: TY_(ParseDocument)
* Parses an HTML document after lexing . It begins by properly configuring
* the overall HTML structure , and subsequently processes all remaining
* nodes .
*/
2011-11-17 02:44:16 +00:00
void TY_ ( ParseDocument ) ( TidyDocImpl * doc )
{
Node * node , * html , * doctype = NULL ;
while ( ( node = TY_ ( GetToken ) ( doc , IgnoreWhitespace ) ) ! = NULL )
{
if ( node - > type = = XmlDecl )
{
2017-03-19 19:41:51 +00:00
doc - > xmlDetected = yes ;
2011-11-17 02:44:16 +00:00
if ( TY_ ( FindXmlDecl ) ( doc ) & & doc - > root . content )
{
2017-09-04 15:24:48 +00:00
TY_ ( Report ) ( doc , & doc - > root , node , DISCARDING_UNEXPECTED ) ;
2011-11-17 02:44:16 +00:00
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
2017-03-19 19:41:51 +00:00
if ( node - > line > 1 | | node - > column ! = 1 )
2011-11-17 02:44:16 +00:00
{
2017-09-04 15:24:48 +00:00
TY_ ( Report ) ( doc , & doc - > root , node , SPACE_PRECEDING_XMLDECL ) ;
2011-11-17 02:44:16 +00:00
}
}
/* deal with comments etc. */
if ( InsertMisc ( & doc - > root , node ) )
continue ;
if ( node - > type = = DocTypeTag )
{
if ( doctype = = NULL )
{
TY_ ( InsertNodeAtEnd ) ( & doc - > root , node ) ;
doctype = node ;
}
else
{
2017-09-04 15:24:48 +00:00
TY_ ( Report ) ( doc , & doc - > root , node , DISCARDING_UNEXPECTED ) ;
2011-11-17 02:44:16 +00:00
TY_ ( FreeNode ) ( doc , node ) ;
}
continue ;
}
if ( node - > type = = EndTag )
{
2017-09-04 15:24:48 +00:00
TY_ ( Report ) ( doc , & doc - > root , node , DISCARDING_UNEXPECTED ) ;
2011-11-17 02:44:16 +00:00
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
if ( node - > type = = StartTag & & nodeIsHTML ( node ) )
{
2021-07-28 23:45:57 +00:00
AttVal * xmlns = TY_ ( AttrGetById ) ( node , TidyAttr_XMLNS ) ;
2011-11-17 02:44:16 +00:00
if ( AttrValueIs ( xmlns , XHTML_NAMESPACE ) )
{
Bool htmlOut = cfgBool ( doc , TidyHtmlOut ) ;
doc - > lexer - > isvoyager = yes ; /* Unless plain HTML */
TY_ ( SetOptionBool ) ( doc , TidyXhtmlOut , ! htmlOut ) ; /* is specified, output*/
TY_ ( SetOptionBool ) ( doc , TidyXmlOut , ! htmlOut ) ; /* will be XHTML. */
/* adjust other config options, just as in config.c */
if ( ! htmlOut )
{
2017-05-12 11:30:20 +00:00
TY_ ( SetOptionBool ) ( doc , TidyUpperCaseTags , no ) ;
2017-05-11 22:12:56 +00:00
TY_ ( SetOptionInt ) ( doc , TidyUpperCaseAttrs , no ) ;
2011-11-17 02:44:16 +00:00
}
}
}
if ( node - > type ! = StartTag | | ! nodeIsHTML ( node ) )
{
TY_ ( UngetToken ) ( doc ) ;
html = TY_ ( InferredTag ) ( doc , TidyTag_HTML ) ;
}
else
html = node ;
2015-04-08 16:45:31 +00:00
/*\
2021-07-28 23:45:57 +00:00
* # 72 , avoid MISSING_DOCTYPE if show - body - only .
2015-04-08 16:45:31 +00:00
* # 191 , also if - - doctype omit , that is TidyDoctypeOmit
2016-02-01 18:44:30 +00:00
* # 342 , adjust tags to html4 - - if not ' auto ' or ' html5 '
2015-04-08 16:45:31 +00:00
\ */
2021-07-28 23:45:57 +00:00
if ( ! TY_ ( FindDocType ) ( doc ) )
2015-04-08 16:45:31 +00:00
{
ulong dtmode = cfg ( doc , TidyDoctypeMode ) ;
2016-02-01 18:44:30 +00:00
if ( ( dtmode ! = TidyDoctypeOmit ) & & ! showingBodyOnly ( doc ) )
2017-09-04 15:24:48 +00:00
TY_ ( Report ) ( doc , NULL , NULL , MISSING_DOCTYPE ) ;
2016-02-01 18:44:30 +00:00
if ( ( dtmode ! = TidyDoctypeAuto ) & & ( dtmode ! = TidyDoctypeHtml5 ) )
{
/*\
* Issue # 342 - if not doctype ' auto ' , or ' html5 '
* then reset mode htm4 - - parsing
\ */
TY_ ( AdjustTags ) ( doc ) ; /* Dynamically modify the tags table to html4-- mode */
}
2015-04-08 16:45:31 +00:00
}
2011-11-17 02:44:16 +00:00
TY_ ( InsertNodeAtEnd ) ( & doc - > root , html ) ;
2021-07-28 23:45:57 +00:00
ParseHTMLWithNode ( doc , html ) ;
2011-11-17 02:44:16 +00:00
break ;
}
/* do this before any more document fixes */
if ( cfg ( doc , TidyAccessibilityCheckLevel ) > 0 )
TY_ ( AccessibilityChecks ) ( doc ) ;
if ( ! TY_ ( FindHTML ) ( doc ) )
{
/* a later check should complain if <body> is empty */
html = TY_ ( InferredTag ) ( doc , TidyTag_HTML ) ;
TY_ ( InsertNodeAtEnd ) ( & doc - > root , html ) ;
2021-07-28 23:45:57 +00:00
ParseHTMLWithNode ( doc , html ) ;
2011-11-17 02:44:16 +00:00
}
2020-10-11 15:51:06 +00:00
node = TY_ ( FindTITLE ) ( doc ) ;
if ( ! node )
2011-11-17 02:44:16 +00:00
{
Node * head = TY_ ( FindHEAD ) ( doc ) ;
2015-02-01 13:16:17 +00:00
/* #72, avoid MISSING_TITLE_ELEMENT if show-body-only (but allow InsertNodeAtEnd to avoid new warning) */
if ( ! showingBodyOnly ( doc ) )
{
2017-09-04 15:24:48 +00:00
TY_ ( Report ) ( doc , head , NULL , MISSING_TITLE_ELEMENT ) ;
2015-02-01 13:16:17 +00:00
}
2011-11-17 02:44:16 +00:00
TY_ ( InsertNodeAtEnd ) ( head , TY_ ( InferredTag ) ( doc , TidyTag_TITLE ) ) ;
}
2020-10-11 15:51:06 +00:00
else if ( ! node - > content & & ! showingBodyOnly ( doc ) )
{
/* Is #839 - warn node is blank in HTML5 */
if ( TY_ ( IsHTML5Mode ) ( doc ) )
{
TY_ ( Report ) ( doc , node , NULL , BLANK_TITLE_ELEMENT ) ;
}
}
2011-11-17 02:44:16 +00:00
AttributeChecks ( doc , & doc - > root ) ;
ReplaceObsoleteElements ( doc , & doc - > root ) ;
TY_ ( DropEmptyElements ) ( doc , & doc - > root ) ;
CleanSpaces ( doc , & doc - > root ) ;
if ( cfgBool ( doc , TidyEncloseBodyText ) )
EncloseBodyText ( doc ) ;
if ( cfgBool ( doc , TidyEncloseBlockText ) )
EncloseBlockText ( doc , & doc - > root ) ;
}
2021-07-28 23:45:57 +00:00
/** MARK: TY_(ParseXMLElement)
* Parses the given XML element .
*/
2011-11-17 02:44:16 +00:00
static void ParseXMLElement ( TidyDocImpl * doc , Node * element , GetTokenMode mode )
{
Lexer * lexer = doc - > lexer ;
Node * node ;
/* if node is pre or has xml:space="preserve" then do so */
if ( TY_ ( XMLPreserveWhiteSpace ) ( doc , element ) )
mode = Preformatted ;
while ( ( node = TY_ ( GetToken ) ( doc , mode ) ) ! = NULL )
{
if ( node - > type = = EndTag & &
node - > element & & element - > element & &
TY_ ( tmbstrcmp ) ( node - > element , element - > element ) = = 0 )
{
TY_ ( FreeNode ) ( doc , node ) ;
element - > closed = yes ;
break ;
}
/* discard unexpected end tags */
if ( node - > type = = EndTag )
{
if ( element )
2017-09-04 15:24:48 +00:00
TY_ ( Report ) ( doc , element , node , UNEXPECTED_ENDTAG_IN ) ;
2011-11-17 02:44:16 +00:00
else
2017-09-04 15:24:48 +00:00
TY_ ( Report ) ( doc , element , node , UNEXPECTED_ENDTAG_ERR ) ;
2011-11-17 02:44:16 +00:00
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
/* parse content on seeing start tag */
if ( node - > type = = StartTag )
ParseXMLElement ( doc , node , mode ) ;
TY_ ( InsertNodeAtEnd ) ( element , node ) ;
}
/*
if first child is text then trim initial space and
delete text node if it is empty .
*/
node = element - > content ;
if ( TY_ ( nodeIsText ) ( node ) & & mode ! = Preformatted )
{
if ( lexer - > lexbuf [ node - > start ] = = ' ' )
{
node - > start + + ;
if ( node - > start > = node - > end )
TY_ ( DiscardElement ) ( doc , node ) ;
}
}
/*
if last child is text then trim final space and
delete the text node if it is empty
*/
node = element - > last ;
if ( TY_ ( nodeIsText ) ( node ) & & mode ! = Preformatted )
{
if ( lexer - > lexbuf [ node - > end - 1 ] = = ' ' )
{
node - > end - - ;
if ( node - > start > = node - > end )
TY_ ( DiscardElement ) ( doc , node ) ;
}
}
}
2021-07-28 23:45:57 +00:00
/** MARK: TY_(ParseXMLDocument)
* Parses the document using Tidy ' s XML parser .
*/
2011-11-17 02:44:16 +00:00
void TY_ ( ParseXMLDocument ) ( TidyDocImpl * doc )
{
Node * node , * doctype = NULL ;
TY_ ( SetOptionBool ) ( doc , TidyXmlTags , yes ) ;
2017-03-19 19:41:51 +00:00
doc - > xmlDetected = yes ;
2011-11-17 02:44:16 +00:00
while ( ( node = TY_ ( GetToken ) ( doc , IgnoreWhitespace ) ) ! = NULL )
{
/* discard unexpected end tags */
if ( node - > type = = EndTag )
{
2017-09-04 15:24:48 +00:00
TY_ ( Report ) ( doc , NULL , node , UNEXPECTED_ENDTAG ) ;
2011-11-17 02:44:16 +00:00
TY_ ( FreeNode ) ( doc , node ) ;
continue ;
}
/* deal with comments etc. */
if ( InsertMisc ( & doc - > root , node ) )
continue ;
if ( node - > type = = DocTypeTag )
{
if ( doctype = = NULL )
{
TY_ ( InsertNodeAtEnd ) ( & doc - > root , node ) ;
doctype = node ;
}
else
{
2017-09-04 15:24:48 +00:00
TY_ ( Report ) ( doc , & doc - > root , node , DISCARDING_UNEXPECTED ) ;
2011-11-17 02:44:16 +00:00
TY_ ( FreeNode ) ( doc , node ) ;
}
continue ;
}
if ( node - > type = = StartEndTag )
{
TY_ ( InsertNodeAtEnd ) ( & doc - > root , node ) ;
continue ;
}
/* if start tag then parse element's content */
if ( node - > type = = StartTag )
{
TY_ ( InsertNodeAtEnd ) ( & doc - > root , node ) ;
ParseXMLElement ( doc , node , IgnoreWhitespace ) ;
continue ;
}
2017-09-04 15:24:48 +00:00
TY_ ( Report ) ( doc , & doc - > root , node , DISCARDING_UNEXPECTED ) ;
2011-11-17 02:44:16 +00:00
TY_ ( FreeNode ) ( doc , node ) ;
}
/* ensure presence of initial <?xml version="1.0"?> */
if ( cfgBool ( doc , TidyXmlDecl ) )
TY_ ( FixXmlDecl ) ( doc ) ;
}
2015-02-01 13:16:17 +00:00
2021-07-28 23:45:57 +00:00
2011-11-17 02:44:16 +00:00
/*
 * local variables:
 * mode: c
 * indent-tabs-mode: nil
 * c-basic-offset: 4
 * eval: (c-set-offset 'substatement-open 0)
 * end:
 */