Merge pull request #981 from htacg/iterate

Documentation and Recursion
This commit is contained in:
Jim Derry 2021-07-29 06:22:48 -04:00 committed by GitHub
commit db847e6e1c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 3698 additions and 1509 deletions

View file

@ -1585,11 +1585,16 @@ void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
*/
void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
{
Stack *stack = TY_(newStack)(doc, 16);
Node *next;
tmbchar indent_buf[ 32 ];
uint indent;
while (node)
{
next = node->next;
if ( nodeIsBLOCKQUOTE(node) && node->implicit )
{
indent = 1;
@ -1602,19 +1607,27 @@ void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
StripOnlyChild( doc, node );
}
if (node->content)
TY_(BQ2Div)( doc, node->content );
TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
2*indent);
RenameElem( doc, node, TidyTag_DIV );
TY_(AddStyleProperty)(doc, node, indent_buf );
if (node->content)
{
TY_(push)(stack, next);
node = node->content;
continue;
}
}
else if (node->content)
TY_(BQ2Div)( doc, node->content );
{
TY_(push)(stack, next);
node = node->content;
continue;
}
node = node->next;
node = next ? next : TY_(pop)(stack);
}
}
@ -2736,30 +2749,42 @@ void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
*/
static void StyleToHead(TidyDocImpl* doc, Node *head, Node *node, Bool fix, int indent)
{
    /* Walks `node`, its following siblings and all of their descendants
     * without recursing. Every STYLE element found is either moved to the
     * end of `head` and reported (when `fix` is set), or only reported.
     * The content of a STYLE element is not descended into.
     *
     * `indent` is kept for signature compatibility only; the iterative
     * walk does not use it.
     */
    Stack *stack = TY_(newStack)(doc, 16);
    Node *next;

    (void)indent;

    while (node)
    {
        next = node->next; /* get 'next' now, in case the node is moved */

        if (nodeIsSTYLE(node))
        {
            if (fix)
            {
                TY_(RemoveNode)(node);                              /* unhook style node from body */
                TY_(InsertNodeAtEnd)(head, node);                   /* add to end of head */
                TY_(Report)(doc, node, head, MOVED_STYLE_TO_HEAD);  /* report move */
            }
            else
            {
                TY_(Report)(doc, node, head, FOUND_STYLE_IN_BODY);
            }
        }
        else if (node->content)
        {
            /* Descend. Remember the sibling to resume with once the children
             * are done; push() ignores a NULL sibling, so a later pop()
             * resumes at the nearest ancestor that still has one. */
            TY_(push)(stack, next);
            node = node->content;
            continue;
        }

        /* Next sibling if there is one, otherwise the most recently saved one. */
        node = next ? next : TY_(pop)(stack);
    }

    /* freeStack() releases only the node array; the Stack record itself was
     * obtained from doc->allocator by newStack(), so release it here too. */
    TY_(freeStack)(stack);
    TidyFree(doc->allocator, stack);
}

View file

@ -877,15 +877,6 @@ static tmbchar LastChar( tmbstr str )
return 0;
}
/*
node->type is one of these:
#define TextNode 1
#define StartTag 2
#define EndTag 3
#define StartEndTag 4
*/
Lexer* TY_(NewLexer)( TidyDocImpl* doc )
{
Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) );
@ -1545,13 +1536,7 @@ void TY_(FreeNode)( TidyDocImpl* doc, Node *node )
}
}
#endif
/* this is no good ;=((
if (node && doc && doc->lexer) {
if (node == doc->lexer->token) {
doc->lexer->token = NULL; // TY_(NewNode)( doc->lexer->allocator, doc->lexer );
}
}
----------------- */
while ( node )
{
Node* next = node->next;
@ -4462,11 +4447,102 @@ static Node *ParseDocTypeDecl(TidyDocImpl* doc)
return NULL;
}
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
/****************************************************************************//*
** MARK: - Node Stack
***************************************************************************/
/**
 * Create a new, empty stack of Node pointers.
 *
 * No NULL check is made on the allocations; as with the rest of this API,
 * the allocator is expected to panic the program if memory runs out.
 *
 * @param doc      Document whose allocator is used for the stack and for any
 *                 later growth.
 * @param capacity Initial number of entries. A value of 0 is raised to 1,
 *                 because growth works by doubling and 0 would never grow.
 * @returns The new stack, with `top` set to -1 (empty).
 */
Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity)
{
    Stack *stack = (Stack *)TidyAlloc(doc->allocator, sizeof(Stack));

    stack->top = -1;
    stack->capacity = capacity ? capacity : 1;
    /* The array holds Node pointers, so size it by the element type. */
    stack->firstNode = (Node **)TidyAlloc(doc->allocator, stack->capacity * sizeof(Node *));
    stack->allocator = doc->allocator;
    return stack;
}
/**
 * Increase the stack size by doubling its capacity. This is called
 * automatically by push() when the current stack is full. No NULL check is
 * made on the allocation; the allocator is expected to panic the program
 * if memory runs out.
 *
 * @param stack The stack to enlarge. Its existing entries are preserved.
 */
void TY_(growStack)(Stack *stack)
{
    /* Doubling 0 yields 0, so a zero-capacity stack grows to one entry. */
    uint new_capacity = stack->capacity ? stack->capacity * 2 : 1;

    /* The allocator takes a size in bytes: scale the entry count by the
     * size of one entry, otherwise the new array is too small. */
    Node **firstNode = (Node **)TidyAlloc(stack->allocator, new_capacity * sizeof(Node *));

    /* top is the index of the last live entry (-1 when empty), so top + 1
     * entries need to be carried over. */
    if (stack->top >= 0)
        memcpy( firstNode, stack->firstNode, sizeof(Node *) * (stack->top + 1) );

    TidyFree(stack->allocator, stack->firstNode);
    stack->firstNode = firstNode;
    stack->capacity = new_capacity;
}
/**
 * Reports whether every allocated slot is in use, i.e. whether `top` is the
 * last valid index. An empty stack (`top` of -1) with a capacity of 0 also
 * counts as full.
 */
Bool TY_(stackFull)(Stack *stack)
{
    /* Number of slots in use, in the same unsigned arithmetic that the
     * comparison `top == capacity - 1` would be evaluated in. */
    const unsigned used = (unsigned)stack->top + 1u;

    return used == stack->capacity;
}
/**
 * Reports whether the stack holds no entries. An empty stack is marked by
 * a `top` index of -1.
 */
Bool TY_(stackEmpty)(Stack *stack)
{
    const int top_index = stack->top;

    return top_index == -1;
}
/**
 * Push an item to the stack, growing the stack first if it is full.
 *
 * A NULL `node` is silently ignored: nothing is stored and the stack is
 * left untouched. Callers can therefore push a possibly-NULL sibling
 * pointer without checking it first.
 *
 * @param stack The stack to push onto.
 * @param node  The node to store, or NULL for no effect.
 */
void TY_(push)(Stack *stack, Node *node)
{
    /* Nothing is stored for NULL, so don't enlarge the array for it either. */
    if (!node)
        return;

    if (TY_(stackFull)(stack))
        TY_(growStack)(stack);

    stack->firstNode[++stack->top] = node;
}
/**
 * Remove the most recently pushed item from the stack and return it.
 *
 * @returns The removed node, or NULL when the stack is empty.
 */
Node* TY_(pop)(Stack *stack)
{
    Node *popped = NULL;

    if (!TY_(stackEmpty)(stack))
    {
        popped = stack->firstNode[stack->top];
        stack->top -= 1;
    }
    return popped;
}
/**
 * Peek at the stack: return the most recently pushed item without
 * removing it.
 *
 * @returns The node on top of the stack, or NULL when the stack is empty.
 */
FUNC_UNUSED Node* TY_(peek)(Stack *stack)
{
    /* Read only: `top` must not move, otherwise a peek behaves like a pop. */
    return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top];
}
/**
 * Frees the stack's node array when done and resets every field to an
 * empty, unusable state (no array, no allocator, zero capacity).
 *
 * NOTE(review): only the array is released here. The Stack record that
 * newStack() allocated is not freed by this function; confirm that callers
 * release it.
 */
void TY_(freeStack)(Stack *stack)
{
    TidyFree( stack->allocator, stack->firstNode );

    stack->firstNode = NULL;
    stack->allocator = NULL;
    stack->capacity = 0;
    stack->top = -1;
}

View file

@ -1,33 +1,46 @@
#ifndef __LEXER_H__
#define __LEXER_H__
/* lexer.h -- Lexer for html parser
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
Given an input source, it returns a sequence of tokens.
GetToken(source) gets the next token
UngetToken(source) provides one level undo
The tags include an attribute list:
- linked list of attribute/value nodes
- each node has 2 NULL-terminated strings.
- entities are replaced in attribute values
white space is compacted if not in preformatted mode
If not in preformatted mode then leading white space
is discarded and subsequent white space sequences
compacted to single space characters.
If XmlTags is no then Tag names are folded to upper
case and attribute names to lower case.
Not yet done:
- Doctype subset and marked sections
*/
/**************************************************************************//**
* @file
* Lexer for HTML and XML Parsers.
*
* Given an input source, it returns a sequence of tokens.
*
* GetToken(source) gets the next token
* UngetToken(source) provides one level undo
*
* The tags include an attribute list:
*
* - linked list of attribute/value nodes
* - each node has 2 NULL-terminated strings.
* - entities are replaced in attribute values
*
* white space is compacted if not in preformatted mode
* If not in preformatted mode then leading white space
* is discarded and subsequent white space sequences
* compacted to single space characters.
*
* If XmlTags is no then Tag names are folded to upper
* case and attribute names to lower case.
*
* Not yet done:
* - Doctype subset and marked sections
*
* @author HTACG, et al (consult git log)
*
* @copyright
* (c) 1998-2021 (W3C) MIT, ERCIM, Keio University, and HTACG.
* See tidy.h for the copyright notice.
* @par
* All Rights Reserved.
* @par
* See `tidy.h` for the complete license.
*
* @date Additional updates: consult git log
*
******************************************************************************/
#ifdef __cplusplus
extern "C" {
@ -35,8 +48,23 @@ extern "C" {
#include "forward.h"
/* lexer character types
*/
/** @addtogroup internal_api */
/** @{ */
/***************************************************************************//**
** @defgroup lexer_h HTML and XML Lexing
**
** These functions and structures form the internal API for document
** lexing.
**
** @{
******************************************************************************/
/**
* Lexer character types.
*/
#define digit 1u
#define letter 2u
#define namechar 4u
@ -47,8 +75,9 @@ extern "C" {
#define digithex 128u
/* node->type is one of these values
*/
/**
* node->type is one of these values
*/
typedef enum
{
RootNode,
@ -68,9 +97,9 @@ typedef enum
} NodeType;
/* lexer GetToken states
*/
/**
* Lexer GetToken() states.
*/
typedef enum
{
LEX_CONTENT,
@ -88,7 +117,10 @@ typedef enum
LEX_XMLDECL
} LexerState;
/* ParseDocTypeDecl state constants */
/**
* ParseDocTypeDecl state constants.
*/
typedef enum
{
DT_INTERMEDIATE,
@ -98,67 +130,44 @@ typedef enum
DT_INTSUBSET
} ParseDocTypeDeclState;
/* content model shortcut encoding
Descriptions are tentative.
*/
/**
* Content model shortcut encoding.
* Descriptions are tentative.
*/
#define CM_UNKNOWN 0
/* Elements with no content. Map to HTML specification. */
#define CM_EMPTY (1 << 0)
/* Elements that appear outside of "BODY". */
#define CM_HTML (1 << 1)
/* Elements that can appear within HEAD. */
#define CM_HEAD (1 << 2)
/* HTML "block" elements. */
#define CM_BLOCK (1 << 3)
/* HTML "inline" elements. */
#define CM_INLINE (1 << 4)
/* Elements that mark list item ("LI"). */
#define CM_LIST (1 << 5)
/* Elements that mark definition list item ("DL", "DT"). */
#define CM_DEFLIST (1 << 6)
/* Elements that can appear inside TABLE. */
#define CM_TABLE (1 << 7)
/* Used for "THEAD", "TFOOT" or "TBODY". */
#define CM_ROWGRP (1 << 8)
/* Used for "TD", "TH" */
#define CM_ROW (1 << 9)
/* Elements whose content must be protected against white space movement.
Includes some elements that can found in forms. */
#define CM_FIELD (1 << 10)
/* Used to avoid propagating inline emphasis inside some elements
such as OBJECT or APPLET. */
#define CM_OBJECT (1 << 11)
/* Elements that allows "PARAM". */
#define CM_PARAM (1 << 12)
/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
#define CM_FRAMES (1 << 13)
/* Heading elements (h1, h2, ...). */
#define CM_HEADING (1 << 14)
/* Elements with an optional end tag. */
#define CM_OPT (1 << 15)
/* Elements that use "align" attribute for vertical position. */
#define CM_IMG (1 << 16)
/* Elements with inline and block model. Used to avoid calling InlineDup. */
#define CM_MIXED (1 << 17)
/* Elements whose content needs to be indented only if containing one
CM_BLOCK element. */
#define CM_NO_INDENT (1 << 18)
/* Elements that are obsolete (such as "dir", "menu"). */
#define CM_OBSOLETE (1 << 19)
/* User defined elements. Used to determine how attributes without value
should be printed. */
#define CM_NEW (1 << 20)
/* Elements that cannot be omitted. */
#define CM_OMITST (1 << 21)
#define CM_EMPTY (1 << 0) /**< Elements with no content. Map to HTML specification. */
#define CM_HTML (1 << 1) /**< Elements that appear outside of "BODY". */
#define CM_HEAD (1 << 2) /**< Elements that can appear within HEAD. */
#define CM_BLOCK (1 << 3) /**< HTML "block" elements. */
#define CM_INLINE (1 << 4) /**< HTML "inline" elements. */
#define CM_LIST (1 << 5) /**< Elements that mark list item ("LI"). */
#define CM_DEFLIST (1 << 6) /**< Elements that mark definition list item ("DL", "DT"). */
#define CM_TABLE (1 << 7) /**< Elements that can appear inside TABLE. */
#define CM_ROWGRP (1 << 8) /**< Used for "THEAD", "TFOOT" or "TBODY". */
#define CM_ROW (1 << 9) /**< Used for "TD", "TH" */
#define CM_FIELD (1 << 10) /**< Elements whose content must be protected against white space movement. Includes some elements that can found in forms. */
#define CM_OBJECT (1 << 11) /**< Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET. */
#define CM_PARAM (1 << 12) /**< Elements that allows "PARAM". */
#define CM_FRAMES (1 << 13) /**< "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
#define CM_HEADING (1 << 14) /**< Heading elements (h1, h2, ...). */
#define CM_OPT (1 << 15) /**< Elements with an optional end tag. */
#define CM_IMG (1 << 16) /**< Elements that use "align" attribute for vertical position. */
#define CM_MIXED (1 << 17) /**< Elements with inline and block model. Used to avoid calling InlineDup. */
#define CM_NO_INDENT (1 << 18) /**< Elements whose content needs to be indented only if containing one CM_BLOCK element. */
#define CM_OBSOLETE (1 << 19) /**< Elements that are obsolete (such as "dir", "menu"). */
#define CM_NEW (1 << 20) /**< User defined elements. Used to determine how attributes without value should be printed. */
#define CM_OMITST (1 << 21) /**< Elements that cannot be omitted. */
/* If the document uses just HTML 2.0 tags and attributes described
** it as HTML 2.0 Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
** If there are proprietary tags and attributes then describe it as
** HTML Proprietary. If it includes the xml-lang or xmlns attributes
** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
** flavors of Voyager (strict, loose or frameset).
*/
/**
* If the document uses just HTML 2.0 tags and attributes then describe
* it as HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
* If there are proprietary tags and attributes then describe it as
* HTML Proprietary. If it includes the xml-lang or xmlns attributes
* but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
* flavors of Voyager (strict, loose or frameset).
*/
/* unknown */
#define xxxx 0u
@ -220,8 +229,10 @@ typedef enum
/* all proprietary types */
#define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
/* Linked list of class names and styles
*/
/**
* Linked list of class names and styles
*/
struct _Style;
typedef struct _Style TagStyle;
@ -234,8 +245,9 @@ struct _Style
};
/* Linked list of style properties
*/
/**
* Linked list of style properties
*/
struct _StyleProp;
typedef struct _StyleProp StyleProp;
@ -247,11 +259,9 @@ struct _StyleProp
};
/* Attribute/Value linked list node
*/
/**
* Attribute/Value linked list node
*/
struct _AttVal
{
AttVal* next;
@ -264,93 +274,89 @@ struct _AttVal
};
/*
Mosaic handles inlines via a separate stack from other elements
We duplicate this to recover from inline markup errors such as:
<i>italic text
<p>more italic text</b> normal text
which for compatibility with Mosaic is mapped to:
<i>italic text</i>
<p><i>more italic text</i> normal text
Note that any inline end tag pop's the effect of the current
inline start tag, so that </b> pop's <i> in the above example.
/**
* Mosaic handles inlines via a separate stack from other elements
* We duplicate this to recover from inline markup errors such as:
* ~~~
* <i>italic text
* <p>more italic text</b> normal text
* ~~~
* which for compatibility with Mosaic is mapped to:
* ~~~
* <i>italic text</i>
* <p><i>more italic text</i> normal text
* ~~~
* Note that any inline end tag pops the effect of the current
* inline start tag, so that `</b>` pops `<i>` in the above example.
*/
struct _IStack
{
IStack* next;
const Dict* tag; /* tag's dictionary definition */
tmbstr element; /* name (NULL for text nodes) */
const Dict* tag; /**< tag's dictionary definition */
tmbstr element; /**< name (NULL for text nodes) */
AttVal* attributes;
};
/* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl,
** etc. etc.
*/
/**
* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc.
*/
struct _Node
{
Node* parent; /* tree structure */
Node* parent; /**< tree structure */
Node* prev;
Node* next;
Node* content;
Node* last;
AttVal* attributes;
const Dict* was; /* old tag when it was changed */
const Dict* tag; /* tag's dictionary definition */
const Dict* was; /**< old tag when it was changed */
const Dict* tag; /**< tag's dictionary definition */
tmbstr element; /* name (NULL for text nodes) */
tmbstr element; /**< name (NULL for text nodes) */
uint start; /* start of span onto text array */
uint end; /* end of span onto text array */
NodeType type; /* TextNode, StartTag, EndTag etc. */
uint start; /**< start of span onto text array */
uint end; /**< end of span onto text array */
NodeType type; /**< TextNode, StartTag, EndTag etc. */
uint line; /* current line of document */
uint column; /* current column of document */
uint line; /**< current line of document */
uint column; /**< current column of document */
Bool closed; /* true if closed by explicit end tag */
Bool implicit; /* true if inferred */
Bool linebreak; /* true if followed by a line break */
Bool closed; /**< true if closed by explicit end tag */
Bool implicit; /**< true if inferred */
Bool linebreak; /**< true if followed by a line break */
};
/*
The following are private to the lexer
Use NewLexer() to create a lexer, and
FreeLexer() to free it.
*/
/**
* The following are private to the lexer.
* Use `NewLexer()` to create a lexer, and `FreeLexer()` to free it.
*/
struct _Lexer
{
uint lines; /* lines seen */
uint columns; /* at start of current token */
Bool waswhite; /* used to collapse contiguous white space */
Bool pushed; /* true after token has been pushed back */
Bool insertspace; /* when space is moved after end tag */
Bool excludeBlocks; /* Netscape compatibility */
Bool exiled; /* true if moved out of table */
Bool isvoyager; /* true if xmlns attribute on html element */
uint versions; /* bit vector of HTML versions */
uint doctype; /* version as given by doctype (if any) */
uint versionEmitted; /* version of doctype emitted */
Bool bad_doctype; /* e.g. if html or PUBLIC is missing */
uint txtstart; /* start of current node */
uint txtend; /* end of current node */
LexerState state; /* state of lexer's finite state machine */
uint lines; /**< lines seen */
uint columns; /**< at start of current token */
Bool waswhite; /**< used to collapse contiguous white space */
Bool pushed; /**< true after token has been pushed back */
Bool insertspace; /**< when space is moved after end tag */
Bool excludeBlocks; /**< Netscape compatibility */
Bool exiled; /**< true if moved out of table */
Bool isvoyager; /**< true if xmlns attribute on html element (i.e., "Voyager" was the W3C codename for XHTML). */
uint versions; /**< bit vector of HTML versions */
uint doctype; /**< version as given by doctype (if any) */
uint versionEmitted; /**< version of doctype emitted */
Bool bad_doctype; /**< e.g. if html or PUBLIC is missing */
uint txtstart; /**< start of current node */
uint txtend; /**< end of current node */
LexerState state; /**< state of lexer's finite state machine */
Node* token; /* last token returned by GetToken() */
Node* itoken; /* last duplicate inline returned by GetToken() */
Node* root; /* remember root node of the document */
Node* parent; /* remember parent node for CDATA elements */
Bool seenEndBody; /* true if a </body> tag has been encountered */
Bool seenEndHtml; /* true if a </html> tag has been encountered */
Node* token; /**< last token returned by GetToken() */
Node* itoken; /**< last duplicate inline returned by GetToken() */
Node* root; /**< remember root node of the document */
Node* parent; /**< remember parent node for CDATA elements */
Bool seenEndBody; /**< true if a `</body>` tag has been encountered */
Bool seenEndHtml; /**< true if a `</html>` tag has been encountered */
/*
Lexer character buffer
@ -361,33 +367,57 @@ struct _Lexer
lexsize must be reset for each file.
*/
tmbstr lexbuf; /* MB character buffer */
uint lexlength; /* allocated */
uint lexsize; /* used */
tmbstr lexbuf; /**< MB character buffer */
uint lexlength; /**< allocated */
uint lexsize; /**< used */
/* Inline stack for compatibility with Mosaic */
Node* inode; /* for deferring text node */
IStack* insert; /* for inferring inline tags */
Node* inode; /**< for deferring text node */
IStack* insert; /**< for inferring inline tags */
IStack* istack;
uint istacklength; /* allocated */
uint istacksize; /* used */
uint istackbase; /* start of frame */
uint istacklength; /**< allocated */
uint istacksize; /**< used */
uint istackbase; /**< start of frame */
TagStyle *styles; /* used for cleaning up presentation markup */
TagStyle *styles; /**< used for cleaning up presentation markup */
TidyAllocator* allocator; /* allocator */
TidyAllocator* allocator; /**< allocator */
};
/* Lexer Functions
*/
/**
* modes for GetToken()
*
* MixedContent -- for elements which don't accept PCDATA
* Preformatted -- white space preserved as is
* IgnoreMarkup -- for CDATA elements such as script, style
*/
typedef enum
{
IgnoreWhitespace,
MixedContent,
Preformatted,
IgnoreMarkup,
OtherNamespace,
CdataContent
} GetTokenMode;
/* choose what version to use for new doctype */
/** @name Lexer Functions
* @{
*/
/**
* Choose what version to use for new doctype
*/
TY_PRIVATE int TY_(HTMLVersion)( TidyDocImpl* doc );
/* everything is allowed in proprietary version of HTML */
/* this is handled here rather than in the tag/attr dicts */
/**
* Everything is allowed in proprietary version of HTML.
* This is handled here rather than in the tag/attr dicts
*/
TY_PRIVATE void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers );
TY_PRIVATE Bool TY_(IsWhite)(uint c);
@ -399,7 +429,6 @@ TY_PRIVATE Bool TY_(IsNamechar)(uint c);
TY_PRIVATE Bool TY_(IsXMLLetter)(uint c);
TY_PRIVATE Bool TY_(IsXMLNamechar)(uint c);
/* Bool IsLower(uint c); */
TY_PRIVATE Bool TY_(IsUpper)(uint c);
TY_PRIVATE uint TY_(ToLower)(uint c);
TY_PRIVATE uint TY_(ToUpper)(uint c);
@ -407,60 +436,82 @@ TY_PRIVATE uint TY_(ToUpper)(uint c);
TY_PRIVATE Lexer* TY_(NewLexer)( TidyDocImpl* doc );
TY_PRIVATE void TY_(FreeLexer)( TidyDocImpl* doc );
/* store character c as UTF-8 encoded byte stream */
/**
* Store character c as UTF-8 encoded byte stream
*/
TY_PRIVATE void TY_(AddCharToLexer)( Lexer *lexer, uint c );
/*
Used for elements and text nodes
element name is NULL for text nodes
start and end are offsets into lexbuf
which contains the textual content of
all elements in the parse tree.
parent and content allow traversal
of the parse tree in any direction.
attributes are represented as a linked
list of AttVal nodes which hold the
strings for attribute/value pairs.
/**
* Used for elements and text nodes.
* - Element name is NULL for text nodes.
* - start and end are offsets into lexbuf,
* which contains the textual content of
* all elements in the parse tree.
* - parent and content allow traversal
* of the parse tree in any direction.
* - attributes are represented as a linked
* list of AttVal nodes which hold the
* strings for attribute/value pairs.
*/
TY_PRIVATE Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer );
/* used to clone heading nodes when split by an <HR> */
/**
* Used to clone heading nodes when split by an `<HR>`
*/
TY_PRIVATE Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element );
/* free node's attributes */
/**
* Free node's attributes
*/
TY_PRIVATE void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node );
/* doesn't repair attribute list linkage */
/**
* Doesn't repair attribute list linkage
*/
TY_PRIVATE void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av );
/* detach attribute from node */
/**
* Detach attribute from node
*/
TY_PRIVATE void TY_(DetachAttribute)( Node *node, AttVal *attr );
/* detach attribute from node then free it
*/
/**
* Detach attribute from node then free it.
*/
TY_PRIVATE void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr );
/*
Free document nodes by iterating through peers and recursing
through children. Set next to NULL before calling FreeNode()
to avoid freeing peer nodes. Doesn't patch up prev/next links.
/**
* Free document nodes by iterating through peers and recursing
* through children. Set `next` to `NULL` before calling `FreeNode()`
* to avoid freeing peer nodes. Doesn't patch up prev/next links.
*/
TY_PRIVATE void TY_(FreeNode)( TidyDocImpl* doc, Node *node );
TY_PRIVATE Node* TY_(TextToken)( Lexer *lexer );
/* used for creating preformatted text from Word2000 */
/**
* Used for creating preformatted text from Word2000.
*/
TY_PRIVATE Node* TY_(NewLineNode)( Lexer *lexer );
/* used for adding a &nbsp; for Word2000 */
/**
* Used for adding a &nbsp; for Word2000.
*/
TY_PRIVATE Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt );
TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
/* TY_PRIVATE void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); */
/* find element */
TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
TY_PRIVATE Node* TY_(FindDocType)( TidyDocImpl* doc );
TY_PRIVATE Node* TY_(FindHTML)( TidyDocImpl* doc );
TY_PRIVATE Node* TY_(FindHEAD)( TidyDocImpl* doc );
@ -468,10 +519,16 @@ TY_PRIVATE Node* TY_(FindTITLE)(TidyDocImpl* doc);
TY_PRIVATE Node* TY_(FindBody)( TidyDocImpl* doc );
TY_PRIVATE Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
/* Returns containing block element, if any */
/**
* Returns containing block element, if any
*/
TY_PRIVATE Node* TY_(FindContainer)( Node* node );
/* add meta element for Tidy */
/**
* Add meta element for Tidy.
*/
TY_PRIVATE Bool TY_(AddGenerator)( TidyDocImpl* doc );
TY_PRIVATE uint TY_(ApparentVersion)( TidyDocImpl* doc );
@ -485,118 +542,209 @@ TY_PRIVATE Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc );
TY_PRIVATE Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc );
/* fixup doctype if missing */
/**
* Fixup doctype if missing.
*/
TY_PRIVATE Bool TY_(FixDocType)( TidyDocImpl* doc );
/* ensure XML document starts with <?xml version="1.0"?> */
/* add encoding attribute if not using ASCII or UTF-8 output */
/**
* Ensure XML document starts with `<?xml version="1.0"?>`, and
* add encoding attribute if not using ASCII or UTF-8 output.
*/
TY_PRIVATE Bool TY_(FixXmlDecl)( TidyDocImpl* doc );
TY_PRIVATE Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id);
TY_PRIVATE void TY_(UngetToken)( TidyDocImpl* doc );
/*
modes for GetToken()
MixedContent -- for elements which don't accept PCDATA
Preformatted -- white space preserved as is
IgnoreMarkup -- for CDATA elements such as script, style
*/
typedef enum
{
IgnoreWhitespace,
MixedContent,
Preformatted,
IgnoreMarkup,
OtherNamespace,
CdataContent
} GetTokenMode;
TY_PRIVATE Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode );
TY_PRIVATE void TY_(InitMap)(void);
/* create a new attribute */
/**
* Create a new attribute.
*/
TY_PRIVATE AttVal* TY_(NewAttribute)( TidyDocImpl* doc );
/* create a new attribute with given name and value */
/**
* Create a new attribute with given name and value.
*/
TY_PRIVATE AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
int delim );
/* insert attribute at the end of attribute list of a node */
/**
* Insert attribute at the end of attribute list of a node.
*/
TY_PRIVATE void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av );
/* insert attribute at the start of attribute list of a node */
/**
* Insert attribute at the start of attribute list of a node.
*/
TY_PRIVATE void TY_(InsertAttributeAtStart)( Node *node, AttVal *av );
/*************************************
In-line Stack functions
*************************************/
/** @}
* @name Inline Stack Functions
* @{
*/
/* duplicate attributes */
/**
* Duplicate attributes.
*/
TY_PRIVATE AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs );
/*
push a copy of an inline node onto stack
but don't push if implicit or OBJECT or APPLET
(implicit tags are ones generated from the istack)
One issue arises with pushing inlines when
the tag is already pushed. For instance:
<p><em>text
<p><em>more text
Shouldn't be mapped to
<p><em>text</em></p>
<p><em><em>more text</em></em>
*/
/**
* Push a copy of an inline node onto stack, but don't push if
* implicit or OBJECT or APPLET (implicit tags are ones generated
* from the istack).
*
* One issue arises with pushing inlines when the tag is already pushed.
* For instance:
* ~~~
* <p><em>text
* <p><em>more text
* ~~~
* Shouldn't be mapped to
* ~~~
* <p><em>text</em></p>
* <p><em><em>more text</em></em>
* ~~~
*/
TY_PRIVATE void TY_(PushInline)( TidyDocImpl* doc, Node* node );
/* pop inline stack */
/**
* Pop inline stack.
*/
TY_PRIVATE void TY_(PopInline)( TidyDocImpl* doc, Node* node );
TY_PRIVATE Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node );
TY_PRIVATE Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node );
/*
This has the effect of inserting "missing" inline
elements around the contents of blocklevel elements
such as P, TD, TH, DIV, PRE etc. This procedure is
called at the start of ParseBlock. when the inline
stack is not empty, as will be the case in:
<i><h1>italic heading</h1></i>
which is then treated as equivalent to
<h1><i>italic heading</i></h1>
This is implemented by setting the lexer into a mode
where it gets tokens from the inline stack rather than
from the input stream.
*/
/**
* This has the effect of inserting "missing" inline elements around the
* contents of blocklevel elements such as P, TD, TH, DIV, PRE etc. This
* procedure is called at the start of `ParseBlock`, when the inline
* stack is not empty, as will be the case in:
* ~~~
* <i><h1>italic heading</h1></i>
* ~~~
* which is then treated as equivalent to
* ~~~
* <h1><i>italic heading</i></h1>
* ~~~
* This is implemented by setting the lexer into a mode where it gets
* tokens from the inline stack rather than from the input stream.
*/
TY_PRIVATE int TY_(InlineDup)( TidyDocImpl* doc, Node *node );
/*
defer duplicates when entering a table or other
element where the inlines shouldn't be duplicated
*/
/**
* Defer duplicates when entering a table or other
* element where the inlines shouldn't be duplicated.
*/
TY_PRIVATE void TY_(DeferDup)( TidyDocImpl* doc );
TY_PRIVATE Node* TY_(InsertedToken)( TidyDocImpl* doc );
/* stack manipulation for inline elements */
/**
* Stack manipulation for inline elements
*/
TY_PRIVATE Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node );
TY_PRIVATE Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element );
/** @}
* @name Generic stack of nodes.
* @{
*/
/**
 * This typedef represents a stack of addresses to nodes. Tidy uses these to
 * try to limit recursion by pushing nodes to a stack when possible instead
 * of recursing.
 */
typedef struct _Stack {
int top; /**< Index of the current top entry; -1 when the stack is empty. */
unsigned capacity; /**< Current capacity, in entries. Can be expanded. */
Node **firstNode; /**< A pointer to the first pointer to a Node in an array of node addresses. */
TidyAllocator* allocator; /**< Tidy's allocator, used at instantiation and expanding. */
} Stack;
/**
* Create a new stack with a given starting capacity. If memory allocation
* fails, then the allocator will panic the program automatically.
*/
TY_PRIVATE Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity);
/**
* Increase the stack size. This will be called automatically when the
* current stack is full. If memory allocation fails, then the allocator
* will panic the program automatically.
*/
TY_PRIVATE void TY_(growStack)(Stack *stack);
/**
* Stack is full when top is equal to the last index.
*/
TY_PRIVATE Bool TY_(stackFull)(Stack *stack);
/**
* Stack is empty when top is equal to -1
*/
TY_PRIVATE Bool TY_(stackEmpty)(Stack *stack);
/**
* Push an item to the stack.
*/
TY_PRIVATE void TY_(push)(Stack *stack, Node *node);
/**
* Pop an item from the stack.
*/
TY_PRIVATE Node* TY_(pop)(Stack *stack);
/**
* Peek at the stack.
*/
TY_PRIVATE Node* TY_(peek)(Stack *stack);
/**
* Frees the stack when done.
*/
TY_PRIVATE void TY_(freeStack)(Stack *stack);
/** @}
*/
#ifdef __cplusplus
}
#endif
/** @} end lexer_h group */
/** @} end internal_api group */
#endif /* __LEXER_H__ */

File diff suppressed because it is too large Load diff

View file

@ -41,6 +41,74 @@
******************************************************************************/
/**
 *  The parsers keep track of their states with the states defined here, and
 *  use these symbols when pushing to the stack so that they can later recreate
 *  their environments when re-entered.
 *
 *  The enumerator order determines the numeric values; append new states
 *  with care. The final enumerator deliberately has no trailing comma, which
 *  strict C90 compilers reject.
 */
typedef enum {
    /* Universal states. */
    STATE_INITIAL,                /**< This is the initial state for every parser. */
    STATE_COMPLETE,               /**< Complete! */
    STATE_PARSE_TAG,              /**< NOTE(review): undocumented; presumably entered to parse a tag -- confirm. */
    STATE_PARSE_TAG_DONE,         /**< NOTE(review): undocumented; presumably the resume point after STATE_PARSE_TAG -- confirm. */
    /* ParseHTML states. */
    STATE_PRE_HEAD,               /**< In this state, we've not detected head yet. */
    STATE_PRE_BODY,               /**< In this state, we'll consider frames vs. body. */
    STATE_PARSE_BODY,             /**< In this state, we can parse the body. */
    STATE_PARSE_HEAD,             /**< In this state, we will setup head for parsing. */
    STATE_PARSE_HEAD_DONE,        /**< Resume here after parsing head. */
    STATE_PARSE_NOFRAMES,         /**< In this state, we can parse noframes content. */
    STATE_PARSE_NOFRAMES_DONE,    /**< In this state, we can restore more state. */
    STATE_PARSE_FRAMESET,         /**< In this state, we will parse frameset content. */
    STATE_PARSE_FRAMESET_DONE     /**< We need to cleanup some things after parsing frameset. */
} parserState;
/**
* This typedef represents the state of a parser when it enters and exits.
* When the parser needs to finish work on the way back up the stack, it will
* push one of these records to the stack, and it will pop a record from the
* stack upon re-entry.
*/
typedef struct _TidyParserMemory
{
Parser *identity; /**< Which parser pushed this record? (Pointer to that parser function.) */
Node *original_node; /**< Originally provided node at entry. */
Node *reentry_node; /**< A node a parser might want to save. */
GetTokenMode reentry_mode; /**< The mode to use for the next node. */
parserState reentry_state; /**< State to set during re-entry. */
GetTokenMode mode; /**< The caller will peek at this value to get the correct mode. */
} TidyParserMemory;
/**
* This typedef represents a stack of TidyParserMemory records. The Tidy
* document has its own instance of this.
*/
typedef struct _TidyParserStack
{
TidyParserMemory* content; /**< Pointer to the stored state records. */
TidyAllocator* allocator; /**< The allocator used for creating. */
uint size; /**< Current size of the stack. */
int top; /**< Top of the stack. */
} TidyParserStack;
/**
* Allocates and initializes the parser's stack. tidyDocCreate will perform
* this automatically.
* @param doc The Tidy document whose parser stack is to be set up.
* NOTE(review): unlike the other prototypes in this header, this one is not
* marked TY_PRIVATE -- confirm whether that is intentional.
*/
void TY_(InitParserStack)( TidyDocImpl* doc );
/**
* Frees the parser's stack when done. tidyDocRelease will perform this
* automatically.
* @param doc The Tidy document whose parser stack is to be freed.
* NOTE(review): unlike the other prototypes in this header, this one is not
* marked TY_PRIVATE -- confirm whether that is intentional.
*/
void TY_(FreeParserStack)( TidyDocImpl* doc );
/**
* Is used to perform a node integrity check recursively after parsing
* an HTML or XML document.
@ -96,7 +164,7 @@ TY_PRIVATE Node *TY_(RemoveNode)(Node *node);
/**
* Remove node from markup tree and discard it.
* @param doc The Tidy document from which to discarb the node.
* @param doc The Tidy document from which to discard the node.
* @param element The node to discard.
* @returns Returns the next node.
*/
@ -202,4 +270,3 @@ TY_PRIVATE void TY_(ParseXMLDocument)( TidyDocImpl* doc );
/** @} end internal_api group */
#endif /* __PARSER_H__ */

View file

@ -168,7 +168,7 @@ static CheckAttribs CheckHTML;
\*/
static Dict tag_defs[] =
{
{ TidyTag_UNKNOWN, "unknown!", VERS_UNKNOWN, NULL, (0), NULL, NULL },
{ TidyTag_UNKNOWN, "unknown!", VERS_UNKNOWN, NULL, (0), NULL, NULL },
/* W3C defined elements */
{ TidyTag_A, "a", VERS_ELEM_A, &TY_(W3CAttrsFor_A)[0], (CM_INLINE|CM_BLOCK|CM_MIXED), TY_(ParseBlock), NULL }, /* Issue #167 & #169 - default HTML5 */
@ -332,7 +332,7 @@ static Dict tag_defs[] =
{ TidyTag_WBR, "wbr", VERS_ELEM_WBR, &TY_(W3CAttrsFor_WBR)[0], (CM_INLINE|CM_EMPTY), TY_(ParseEmpty), NULL },
/* this must be the final entry */
{ (TidyTagId)0, NULL, 0, NULL, (0), NULL, NULL }
{ (TidyTagId)0, NULL, 0, NULL, (0), NULL, NULL }
};
static uint tagsHash(ctmbstr s)

View file

@ -61,8 +61,13 @@ typedef enum
/** This typedef describes a function to be used to parse HTML of a Tidy tag.
** @param doc The Tidy document.
** @param node The node being parsed.
** @param mode The GetTokenMode to be used for parsing the node contents.
** @param popStack A flag indicating that we are re-entering this parser, and
** it should restore a state from the stack.
** @returns A Node pointer (for the signature that returns Node*).
** NOTE(review): what the returned node means to the caller is not
** documented here -- confirm against the parser implementations.
*/
typedef void (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode );
typedef Node* (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode, Bool popStack );
/** This typedef describes a function to be used to check the attributes

View file

@ -16,6 +16,7 @@
#include "pprint.h"
#include "access.h"
#include "message.h"
#include "parser.h"
#ifndef MAX
#define MAX(a,b) (((a) > (b))?(a):(b))
@ -41,19 +42,20 @@ struct _TidyDocImpl
Lexer* lexer;
/* Config + Markup Declarations */
TidyConfigImpl config;
TidyTagImpl tags;
TidyAttribImpl attribs;
TidyAccessImpl access;
TidyMutedMessages muted;
TidyConfigImpl config;
TidyTagImpl tags;
TidyAttribImpl attribs;
TidyAccessImpl access;
TidyMutedMessages muted;
/* The Pretty Print buffer */
TidyPrintImpl pprint;
TidyPrintImpl pprint;
/* I/O */
StreamIn* docIn;
StreamOut* docOut;
StreamOut* errout;
TidyReportFilter reportFilter;
TidyReportCallback reportCallback;
TidyMessageCallback messageCallback;
@ -62,6 +64,8 @@ struct _TidyDocImpl
TidyConfigChangeCallback pConfigChangeCallback;
TidyPPProgress progressCallback;
TidyParserStack stack;
/* Parse + Repair Results */
uint optionErrors;
uint errors;

View file

@ -112,6 +112,7 @@ TidyDocImpl* tidyDocCreate( TidyAllocator *allocator )
TY_(InitAttrs)( doc );
TY_(InitConfig)( doc );
TY_(InitPrintBuf)( doc );
TY_(InitParserStack)( doc );
/* Set the locale for tidy's output. This both configures
** LibTidy to use the environment's locale as well as the
@ -172,6 +173,7 @@ void tidyDocRelease( TidyDocImpl* doc )
* to determine which hash is to be used, so free it last.
\*/
TY_(FreeLexer)( doc );
TY_(FreeParserStack)( doc );
TidyDocFree( doc, doc );
}
}