Merge pull request #981 from htacg/iterate

Documentation and Recursion
This commit is contained in:
Jim Derry 2021-07-29 06:22:48 -04:00 committed by GitHub
commit db847e6e1c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 3698 additions and 1509 deletions

View file

@ -1585,11 +1585,16 @@ void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
*/ */
void TY_(BQ2Div)( TidyDocImpl* doc, Node *node ) void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
{ {
Stack *stack = TY_(newStack)(doc, 16);
Node *next;
tmbchar indent_buf[ 32 ]; tmbchar indent_buf[ 32 ];
uint indent; uint indent;
while (node) while (node)
{ {
next = node->next;
if ( nodeIsBLOCKQUOTE(node) && node->implicit ) if ( nodeIsBLOCKQUOTE(node) && node->implicit )
{ {
indent = 1; indent = 1;
@ -1602,19 +1607,27 @@ void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
StripOnlyChild( doc, node ); StripOnlyChild( doc, node );
} }
if (node->content)
TY_(BQ2Div)( doc, node->content );
TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem", TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
2*indent); 2*indent);
RenameElem( doc, node, TidyTag_DIV ); RenameElem( doc, node, TidyTag_DIV );
TY_(AddStyleProperty)(doc, node, indent_buf ); TY_(AddStyleProperty)(doc, node, indent_buf );
if (node->content)
{
TY_(push)(stack, next);
node = node->content;
continue;
}
} }
else if (node->content) else if (node->content)
TY_(BQ2Div)( doc, node->content ); {
TY_(push)(stack, next);
node = node->content;
continue;
}
node = node->next; node = next ? next : TY_(pop)(stack);
} }
} }
@ -2736,11 +2749,13 @@ void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
*/ */
static void StyleToHead(TidyDocImpl* doc, Node *head, Node *node, Bool fix, int indent) static void StyleToHead(TidyDocImpl* doc, Node *head, Node *node, Bool fix, int indent)
{ {
Stack *stack = TY_(newStack)(doc, 16);
Node *next; Node *next;
while (node) while (node)
{ {
next = node->next; /* get 'next' now , in case the node is moved */ next = node->next;
/* dbg_show_node(doc, node, 0, indent); */
if (nodeIsSTYLE(node)) if (nodeIsSTYLE(node))
{ {
if (fix) if (fix)
@ -2756,9 +2771,19 @@ static void StyleToHead(TidyDocImpl* doc, Node *head, Node *node, Bool fix, int
} }
else if (node->content) else if (node->content)
{ {
StyleToHead(doc, head, node->content, fix, indent + 1); TY_(push)(stack, next);
node = node->content;
indent++;
continue;
}
if (next)
node = next;
else
{
node = TY_(pop)(stack);
indent--;
} }
node = next; /* process the 'next', if any */
} }
} }

View file

@ -877,15 +877,6 @@ static tmbchar LastChar( tmbstr str )
return 0; return 0;
} }
/*
node->type is one of these:
#define TextNode 1
#define StartTag 2
#define EndTag 3
#define StartEndTag 4
*/
Lexer* TY_(NewLexer)( TidyDocImpl* doc ) Lexer* TY_(NewLexer)( TidyDocImpl* doc )
{ {
Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) ); Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) );
@ -1545,13 +1536,7 @@ void TY_(FreeNode)( TidyDocImpl* doc, Node *node )
} }
} }
#endif #endif
/* this is no good ;=((
if (node && doc && doc->lexer) {
if (node == doc->lexer->token) {
doc->lexer->token = NULL; // TY_(NewNode)( doc->lexer->allocator, doc->lexer );
}
}
----------------- */
while ( node ) while ( node )
{ {
Node* next = node->next; Node* next = node->next;
@ -4462,11 +4447,102 @@ static Node *ParseDocTypeDecl(TidyDocImpl* doc)
return NULL; return NULL;
} }
/*
* local variables: /****************************************************************************//*
* mode: c ** MARK: - Node Stack
* indent-tabs-mode: nil ***************************************************************************/
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end: /**
* Create a new stack with a given starting capacity. If memory allocation
* fails, then the allocator will panic the program automatically.
*/ */
Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity)
{
    Stack *stack = (Stack *)TidyAlloc(doc->allocator, sizeof(Stack));

    /* A top of -1 marks the stack as empty. */
    stack->top = -1;

    /* Never create a zero-slot stack: the first push would have nowhere
       to store its node. */
    stack->capacity = capacity ? capacity : 1;

    /* The array holds Node pointers, so each slot is sizeof(Node*) bytes. */
    stack->firstNode = (Node **)TidyAlloc(doc->allocator, stack->capacity * sizeof(Node*));

    /* Keep the allocator so the stack can grow and be freed without the doc. */
    stack->allocator = doc->allocator;
    return stack;
}
/**
 * Increase the stack size. This will be called automatically when the
 * current stack is full. If memory allocation fails, then the allocator
 * will panic the program automatically.
 *
 * The capacity is doubled (a zero capacity becomes one slot), the node
 * pointers currently stored are copied to the new array, and the old
 * array is released.
 */
void TY_(growStack)(Stack *stack)
{
    /* Doubling zero would stay at zero, so always end up with at least one slot. */
    uint new_capacity = stack->capacity ? stack->capacity * 2 : 1;

    /* The allocation size is in bytes: scale the slot count by the slot size. */
    Node **firstNode = (Node **)TidyAlloc(stack->allocator, new_capacity * sizeof(Node*));

    /* Only slots 0..top are in use; with top == -1 nothing is copied. */
    memcpy( firstNode, stack->firstNode, sizeof(Node*) * (stack->top + 1) );
    TidyFree(stack->allocator, stack->firstNode);

    stack->firstNode = firstNode;
    stack->capacity = new_capacity;
}
/**
 * Reports whether every slot of the stack is occupied, i.e. whether the
 * number of stored nodes (top + 1) has reached the capacity.
 */
Bool TY_(stackFull)(Stack *stack)
{
    /* top is the index of the last stored node, so top + 1 nodes are held. */
    const unsigned used = (unsigned)stack->top + 1u;

    return used == stack->capacity;
}
/**
 * Reports whether the stack holds no nodes. An empty stack is marked by
 * a top index of -1.
 */
Bool TY_(stackEmpty)(Stack *stack)
{
    const int top = stack->top;

    return -1 == top;
}
/**
 * Push an item to the stack. A NULL node is ignored and leaves the stack
 * untouched. The stack is grown first when it is already full.
 */
void TY_(push)(Stack *stack, Node *node)
{
    /* Nothing is stored for NULL, so do not grow the stack on its behalf. */
    if (!node)
        return;

    if (TY_(stackFull)(stack))
        TY_(growStack)(stack);

    stack->firstNode[++stack->top] = node;
}
/**
 * Remove the most recently pushed node from the stack and return it.
 * Returns NULL when the stack is empty.
 */
Node* TY_(pop)(Stack *stack)
{
    Node *topNode = NULL;

    if (!TY_(stackEmpty)(stack))
    {
        topNode = stack->firstNode[stack->top];
        stack->top--;
    }
    return topNode;
}
/**
 * Peek at the stack: return the node on top without removing it.
 * Returns NULL when the stack is empty.
 */
FUNC_UNUSED Node* TY_(peek)(Stack *stack)
{
    /* Read the top slot only; top must not move, or this would be a pop. */
    return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top];
}
/**
 * Releases the stack's node array and resets every field of the stack.
 * Only the array is handed back to the allocator here; the Stack
 * structure itself is not released by this function.
 */
void TY_(freeStack)(Stack *stack)
{
    /* Grab what the release needs before the fields are cleared. */
    TidyAllocator *allocator = stack->allocator;
    Node **nodes = stack->firstNode;

    stack->allocator = NULL;
    stack->firstNode = NULL;
    stack->capacity = 0;
    stack->top = -1;

    TidyFree( allocator, nodes );
}

View file

@ -1,33 +1,46 @@
#ifndef __LEXER_H__ #ifndef __LEXER_H__
#define __LEXER_H__ #define __LEXER_H__
/* lexer.h -- Lexer for html parser
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University /**************************************************************************//**
See tidy.h for the copyright notice. * @file
* Lexer for HTML and XML Parsers.
Given an input source, it returns a sequence of tokens. *
* Given an input source, it returns a sequence of tokens.
GetToken(source) gets the next token *
UngetToken(source) provides one level undo * GetToken(source) gets the next token
* UngetToken(source) provides one level undo
The tags include an attribute list: *
* The tags include an attribute list:
- linked list of attribute/value nodes *
- each node has 2 NULL-terminated strings. * - linked list of attribute/value nodes
- entities are replaced in attribute values * - each node has 2 NULL-terminated strings.
* - entities are replaced in attribute values
white space is compacted if not in preformatted mode *
If not in preformatted mode then leading white space * white space is compacted if not in preformatted mode
is discarded and subsequent white space sequences * If not in preformatted mode then leading white space
compacted to single space characters. * is discarded and subsequent white space sequences
* compacted to single space characters.
If XmlTags is no then Tag names are folded to upper *
case and attribute names to lower case. * If XmlTags is no then Tag names are folded to upper
* case and attribute names to lower case.
Not yet done: *
- Doctype subset and marked sections * Not yet done:
*/ * - Doctype subset and marked sections
*
* @author HTACG, et al (consult git log)
*
* @copyright
* (c) 1998-2021 (W3C) MIT, ERCIM, Keio University, and HTACG.
* See tidy.h for the copyright notice.
* @par
* All Rights Reserved.
* @par
* See `tidy.h` for the complete license.
*
* @date Additional updates: consult git log
*
******************************************************************************/
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -35,8 +48,23 @@ extern "C" {
#include "forward.h" #include "forward.h"
/* lexer character types /** @addtogroup internal_api */
*/ /** @{ */
/***************************************************************************//**
** @defgroup lexer_h HTML and XML Lexing
**
** These functions and structures form the internal API for document
** lexing.
**
** @{
******************************************************************************/
/**
* Lexer character types.
*/
#define digit 1u #define digit 1u
#define letter 2u #define letter 2u
#define namechar 4u #define namechar 4u
@ -47,8 +75,9 @@ extern "C" {
#define digithex 128u #define digithex 128u
/* node->type is one of these values /**
*/ * node->type is one of these values
*/
typedef enum typedef enum
{ {
RootNode, RootNode,
@ -68,9 +97,9 @@ typedef enum
} NodeType; } NodeType;
/**
/* lexer GetToken states * Lexer GetToken() states.
*/ */
typedef enum typedef enum
{ {
LEX_CONTENT, LEX_CONTENT,
@ -88,7 +117,10 @@ typedef enum
LEX_XMLDECL LEX_XMLDECL
} LexerState; } LexerState;
/* ParseDocTypeDecl state constants */
/**
* ParseDocTypeDecl state constants.
*/
typedef enum typedef enum
{ {
DT_INTERMEDIATE, DT_INTERMEDIATE,
@ -98,67 +130,44 @@ typedef enum
DT_INTSUBSET DT_INTSUBSET
} ParseDocTypeDeclState; } ParseDocTypeDeclState;
/* content model shortcut encoding
Descriptions are tentative. /**
*/ * Content model shortcut encoding.
* Descriptions are tentative.
*/
#define CM_UNKNOWN 0 #define CM_UNKNOWN 0
/* Elements with no content. Map to HTML specification. */ #define CM_EMPTY (1 << 0) /**< Elements with no content. Map to HTML specification. */
#define CM_EMPTY (1 << 0) #define CM_HTML (1 << 1) /**< Elements that appear outside of "BODY". */
/* Elements that appear outside of "BODY". */ #define CM_HEAD (1 << 2) /**< Elements that can appear within HEAD. */
#define CM_HTML (1 << 1) #define CM_BLOCK (1 << 3) /**< HTML "block" elements. */
/* Elements that can appear within HEAD. */ #define CM_INLINE (1 << 4) /**< HTML "inline" elements. */
#define CM_HEAD (1 << 2) #define CM_LIST (1 << 5) /**< Elements that mark list item ("LI"). */
/* HTML "block" elements. */ #define CM_DEFLIST (1 << 6) /**< Elements that mark definition list item ("DL", "DT"). */
#define CM_BLOCK (1 << 3) #define CM_TABLE (1 << 7) /**< Elements that can appear inside TABLE. */
/* HTML "inline" elements. */ #define CM_ROWGRP (1 << 8) /**< Used for "THEAD", "TFOOT" or "TBODY". */
#define CM_INLINE (1 << 4) #define CM_ROW (1 << 9) /**< Used for "TD", "TH" */
/* Elements that mark list item ("LI"). */ #define CM_FIELD (1 << 10) /**< Elements whose content must be protected against white space movement. Includes some elements that can found in forms. */
#define CM_LIST (1 << 5) #define CM_OBJECT (1 << 11) /**< Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET. */
/* Elements that mark definition list item ("DL", "DT"). */ #define CM_PARAM (1 << 12) /**< Elements that allow "PARAM". */
#define CM_DEFLIST (1 << 6) #define CM_FRAMES (1 << 13) /**< "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
/* Elements that can appear inside TABLE. */ #define CM_HEADING (1 << 14) /**< Heading elements (h1, h2, ...). */
#define CM_TABLE (1 << 7) #define CM_OPT (1 << 15) /**< Elements with an optional end tag. */
/* Used for "THEAD", "TFOOT" or "TBODY". */ #define CM_IMG (1 << 16) /**< Elements that use "align" attribute for vertical position. */
#define CM_ROWGRP (1 << 8) #define CM_MIXED (1 << 17) /**< Elements with inline and block model. Used to avoid calling InlineDup. */
/* Used for "TD", "TH" */ #define CM_NO_INDENT (1 << 18) /**< Elements whose content needs to be indented only if containing one CM_BLOCK element. */
#define CM_ROW (1 << 9) #define CM_OBSOLETE (1 << 19) /**< Elements that are obsolete (such as "dir", "menu"). */
/* Elements whose content must be protected against white space movement. #define CM_NEW (1 << 20) /**< User defined elements. Used to determine how attributes without value should be printed. */
Includes some elements that can found in forms. */ #define CM_OMITST (1 << 21) /**< Elements that cannot be omitted. */
#define CM_FIELD (1 << 10)
/* Used to avoid propagating inline emphasis inside some elements
such as OBJECT or APPLET. */
#define CM_OBJECT (1 << 11)
/* Elements that allows "PARAM". */
#define CM_PARAM (1 << 12)
/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
#define CM_FRAMES (1 << 13)
/* Heading elements (h1, h2, ...). */
#define CM_HEADING (1 << 14)
/* Elements with an optional end tag. */
#define CM_OPT (1 << 15)
/* Elements that use "align" attribute for vertical position. */
#define CM_IMG (1 << 16)
/* Elements with inline and block model. Used to avoid calling InlineDup. */
#define CM_MIXED (1 << 17)
/* Elements whose content needs to be indented only if containing one
CM_BLOCK element. */
#define CM_NO_INDENT (1 << 18)
/* Elements that are obsolete (such as "dir", "menu"). */
#define CM_OBSOLETE (1 << 19)
/* User defined elements. Used to determine how attributes without value
should be printed. */
#define CM_NEW (1 << 20)
/* Elements that cannot be omitted. */
#define CM_OMITST (1 << 21)
/* If the document uses just HTML 2.0 tags and attributes described
** it as HTML 2.0 Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. /**
** If there are proprietary tags and attributes then describe it as * If the document uses just HTML 2.0 tags and attributes, describe
** HTML Proprietary. If it includes the xml-lang or xmlns attributes * it as HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the * If there are proprietary tags and attributes then describe it as
** flavors of Voyager (strict, loose or frameset). * HTML Proprietary. If it includes the xml-lang or xmlns attributes
*/ * but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
* flavors of Voyager (strict, loose or frameset).
*/
/* unknown */ /* unknown */
#define xxxx 0u #define xxxx 0u
@ -220,8 +229,10 @@ typedef enum
/* all proprietary types */ /* all proprietary types */
#define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN) #define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
/* Linked list of class names and styles
*/ /**
* Linked list of class names and styles
*/
struct _Style; struct _Style;
typedef struct _Style TagStyle; typedef struct _Style TagStyle;
@ -234,8 +245,9 @@ struct _Style
}; };
/* Linked list of style properties /**
*/ * Linked list of style properties
*/
struct _StyleProp; struct _StyleProp;
typedef struct _StyleProp StyleProp; typedef struct _StyleProp StyleProp;
@ -247,11 +259,9 @@ struct _StyleProp
}; };
/**
* Attribute/Value linked list node
/* Attribute/Value linked list node */
*/
struct _AttVal struct _AttVal
{ {
AttVal* next; AttVal* next;
@ -264,93 +274,89 @@ struct _AttVal
}; };
/**
/* * Mosaic handles inlines via a separate stack from other elements
Mosaic handles inlines via a separate stack from other elements * We duplicate this to recover from inline markup errors such as:
We duplicate this to recover from inline markup errors such as: * ~~~
* <i>italic text
<i>italic text * <p>more italic text</b> normal text
<p>more italic text</b> normal text * ~~~
* which for compatibility with Mosaic is mapped to:
which for compatibility with Mosaic is mapped to: * ~~~
* <i>italic text</i>
<i>italic text</i> * <p><i>more italic text</i> normal text
<p><i>more italic text</i> normal text * ~~~
* Note that any inline end tag pop's the effect of the current
Note that any inline end tag pop's the effect of the current * inline start tag, so that `</b>` pop's `<i>` in the above example.
inline start tag, so that </b> pop's <i> in the above example.
*/ */
struct _IStack struct _IStack
{ {
IStack* next; IStack* next;
const Dict* tag; /* tag's dictionary definition */ const Dict* tag; /**< tag's dictionary definition */
tmbstr element; /* name (NULL for text nodes) */ tmbstr element; /**< name (NULL for text nodes) */
AttVal* attributes; AttVal* attributes;
}; };
/* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, /**
** etc. etc. * HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc.
*/ */
struct _Node struct _Node
{ {
Node* parent; /* tree structure */ Node* parent; /**< tree structure */
Node* prev; Node* prev;
Node* next; Node* next;
Node* content; Node* content;
Node* last; Node* last;
AttVal* attributes; AttVal* attributes;
const Dict* was; /* old tag when it was changed */ const Dict* was; /**< old tag when it was changed */
const Dict* tag; /* tag's dictionary definition */ const Dict* tag; /**< tag's dictionary definition */
tmbstr element; /* name (NULL for text nodes) */ tmbstr element; /**< name (NULL for text nodes) */
uint start; /* start of span onto text array */ uint start; /**< start of span onto text array */
uint end; /* end of span onto text array */ uint end; /**< end of span onto text array */
NodeType type; /* TextNode, StartTag, EndTag etc. */ NodeType type; /**< TextNode, StartTag, EndTag etc. */
uint line; /* current line of document */ uint line; /**< current line of document */
uint column; /* current column of document */ uint column; /**< current column of document */
Bool closed; /* true if closed by explicit end tag */ Bool closed; /**< true if closed by explicit end tag */
Bool implicit; /* true if inferred */ Bool implicit; /**< true if inferred */
Bool linebreak; /* true if followed by a line break */ Bool linebreak; /**< true if followed by a line break */
}; };
/* /**
The following are private to the lexer * The following are private to the lexer.
Use NewLexer() to create a lexer, and * Use `NewLexer()` to create a lexer, and `FreeLexer()` to free it.
FreeLexer() to free it. */
*/
struct _Lexer struct _Lexer
{ {
uint lines; /* lines seen */ uint lines; /**< lines seen */
uint columns; /* at start of current token */ uint columns; /**< at start of current token */
Bool waswhite; /* used to collapse contiguous white space */ Bool waswhite; /**< used to collapse contiguous white space */
Bool pushed; /* true after token has been pushed back */ Bool pushed; /**< true after token has been pushed back */
Bool insertspace; /* when space is moved after end tag */ Bool insertspace; /**< when space is moved after end tag */
Bool excludeBlocks; /* Netscape compatibility */ Bool excludeBlocks; /**< Netscape compatibility */
Bool exiled; /* true if moved out of table */ Bool exiled; /**< true if moved out of table */
Bool isvoyager; /* true if xmlns attribute on html element */ Bool isvoyager; /**< true if xmlns attribute on html element (i.e., "Voyager" was the W3C codename for XHTML). */
uint versions; /* bit vector of HTML versions */ uint versions; /**< bit vector of HTML versions */
uint doctype; /* version as given by doctype (if any) */ uint doctype; /**< version as given by doctype (if any) */
uint versionEmitted; /* version of doctype emitted */ uint versionEmitted; /**< version of doctype emitted */
Bool bad_doctype; /* e.g. if html or PUBLIC is missing */ Bool bad_doctype; /**< e.g. if html or PUBLIC is missing */
uint txtstart; /* start of current node */ uint txtstart; /**< start of current node */
uint txtend; /* end of current node */ uint txtend; /**< end of current node */
LexerState state; /* state of lexer's finite state machine */ LexerState state; /**< state of lexer's finite state machine */
Node* token; /* last token returned by GetToken() */ Node* token; /**< last token returned by GetToken() */
Node* itoken; /* last duplicate inline returned by GetToken() */ Node* itoken; /**< last duplicate inline returned by GetToken() */
Node* root; /* remember root node of the document */ Node* root; /**< remember root node of the document */
Node* parent; /* remember parent node for CDATA elements */ Node* parent; /**< remember parent node for CDATA elements */
Bool seenEndBody; /* true if a </body> tag has been encountered */ Bool seenEndBody; /**< true if a `</body>` tag has been encountered */
Bool seenEndHtml; /* true if a </html> tag has been encountered */ Bool seenEndHtml; /**< true if a `</html>` tag has been encountered */
/* /*
Lexer character buffer Lexer character buffer
@ -361,33 +367,57 @@ struct _Lexer
lexsize must be reset for each file. lexsize must be reset for each file.
*/ */
tmbstr lexbuf; /* MB character buffer */ tmbstr lexbuf; /**< MB character buffer */
uint lexlength; /* allocated */ uint lexlength; /**< allocated */
uint lexsize; /* used */ uint lexsize; /**< used */
/* Inline stack for compatibility with Mosaic */ /* Inline stack for compatibility with Mosaic */
Node* inode; /* for deferring text node */ Node* inode; /**< for deferring text node */
IStack* insert; /* for inferring inline tags */ IStack* insert; /**< for inferring inline tags */
IStack* istack; IStack* istack;
uint istacklength; /* allocated */ uint istacklength; /**< allocated */
uint istacksize; /* used */ uint istacksize; /**< used */
uint istackbase; /* start of frame */ uint istackbase; /**< start of frame */
TagStyle *styles; /* used for cleaning up presentation markup */ TagStyle *styles; /**< used for cleaning up presentation markup */
TidyAllocator* allocator; /* allocator */ TidyAllocator* allocator; /**< allocator */
}; };
/* Lexer Functions /**
*/ * modes for GetToken()
*
* MixedContent -- for elements which don't accept PCDATA
* Preformatted -- white space preserved as is
* IgnoreMarkup -- for CDATA elements such as script, style
*/
typedef enum
{
IgnoreWhitespace,
MixedContent,
Preformatted,
IgnoreMarkup,
OtherNamespace,
CdataContent
} GetTokenMode;
/* choose what version to use for new doctype */
/** @name Lexer Functions
* @{
*/
/**
* Choose what version to use for new doctype
*/
TY_PRIVATE int TY_(HTMLVersion)( TidyDocImpl* doc ); TY_PRIVATE int TY_(HTMLVersion)( TidyDocImpl* doc );
/* everything is allowed in proprietary version of HTML */
/* this is handled here rather than in the tag/attr dicts */
/**
* Everything is allowed in proprietary version of HTML.
* This is handled here rather than in the tag/attr dicts
*/
TY_PRIVATE void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers ); TY_PRIVATE void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers );
TY_PRIVATE Bool TY_(IsWhite)(uint c); TY_PRIVATE Bool TY_(IsWhite)(uint c);
@ -399,7 +429,6 @@ TY_PRIVATE Bool TY_(IsNamechar)(uint c);
TY_PRIVATE Bool TY_(IsXMLLetter)(uint c); TY_PRIVATE Bool TY_(IsXMLLetter)(uint c);
TY_PRIVATE Bool TY_(IsXMLNamechar)(uint c); TY_PRIVATE Bool TY_(IsXMLNamechar)(uint c);
/* Bool IsLower(uint c); */
TY_PRIVATE Bool TY_(IsUpper)(uint c); TY_PRIVATE Bool TY_(IsUpper)(uint c);
TY_PRIVATE uint TY_(ToLower)(uint c); TY_PRIVATE uint TY_(ToLower)(uint c);
TY_PRIVATE uint TY_(ToUpper)(uint c); TY_PRIVATE uint TY_(ToUpper)(uint c);
@ -407,60 +436,82 @@ TY_PRIVATE uint TY_(ToUpper)(uint c);
TY_PRIVATE Lexer* TY_(NewLexer)( TidyDocImpl* doc ); TY_PRIVATE Lexer* TY_(NewLexer)( TidyDocImpl* doc );
TY_PRIVATE void TY_(FreeLexer)( TidyDocImpl* doc ); TY_PRIVATE void TY_(FreeLexer)( TidyDocImpl* doc );
/* store character c as UTF-8 encoded byte stream */
/**
* Store character c as UTF-8 encoded byte stream
*/
TY_PRIVATE void TY_(AddCharToLexer)( Lexer *lexer, uint c ); TY_PRIVATE void TY_(AddCharToLexer)( Lexer *lexer, uint c );
/*
Used for elements and text nodes
element name is NULL for text nodes
start and end are offsets into lexbuf
which contains the textual content of
all elements in the parse tree.
parent and content allow traversal /**
of the parse tree in any direction. * Used for elements and text nodes.
attributes are represented as a linked * - Element name is NULL for text nodes.
list of AttVal nodes which hold the * - start and end are offsets into lexbuf,
strings for attribute/value pairs. * which contains the textual content of
* all elements in the parse tree.
* - parent and content allow traversal
* of the parse tree in any direction.
* - attributes are represented as a linked
* list of AttVal nodes which hold the
* strings for attribute/value pairs.
*/ */
TY_PRIVATE Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer ); TY_PRIVATE Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer );
/* used to clone heading nodes when split by an <HR> */ /**
* Used to clone heading nodes when split by an `<HR>`
*/
TY_PRIVATE Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element ); TY_PRIVATE Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element );
/* free node's attributes */
/**
* Free node's attributes
*/
TY_PRIVATE void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node ); TY_PRIVATE void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node );
/* doesn't repair attribute list linkage */
/**
* Doesn't repair attribute list linkage
*/
TY_PRIVATE void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av ); TY_PRIVATE void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av );
/* detach attribute from node */
/**
* Detach attribute from node
*/
TY_PRIVATE void TY_(DetachAttribute)( Node *node, AttVal *attr ); TY_PRIVATE void TY_(DetachAttribute)( Node *node, AttVal *attr );
/* detach attribute from node then free it
*/ /**
* Detach attribute from node then free it.
*/
TY_PRIVATE void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr ); TY_PRIVATE void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr );
/*
Free document nodes by iterating through peers and recursing /**
through children. Set next to NULL before calling FreeNode() * Free document nodes by iterating through peers and recursing
to avoid freeing peer nodes. Doesn't patch up prev/next links. * through children. Set `next` to `NULL` before calling `FreeNode()`
* to avoid freeing peer nodes. Doesn't patch up prev/next links.
*/ */
TY_PRIVATE void TY_(FreeNode)( TidyDocImpl* doc, Node *node ); TY_PRIVATE void TY_(FreeNode)( TidyDocImpl* doc, Node *node );
TY_PRIVATE Node* TY_(TextToken)( Lexer *lexer ); TY_PRIVATE Node* TY_(TextToken)( Lexer *lexer );
/* used for creating preformatted text from Word2000 */
/**
* Used for creating preformatted text from Word2000.
*/
TY_PRIVATE Node* TY_(NewLineNode)( Lexer *lexer ); TY_PRIVATE Node* TY_(NewLineNode)( Lexer *lexer );
/* used for adding a &nbsp; for Word2000 */
/**
* Used for adding a &nbsp; for Word2000.
*/
TY_PRIVATE Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt ); TY_PRIVATE Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt );
TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
/* TY_PRIVATE void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); */
/* find element */ TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
TY_PRIVATE Node* TY_(FindDocType)( TidyDocImpl* doc ); TY_PRIVATE Node* TY_(FindDocType)( TidyDocImpl* doc );
TY_PRIVATE Node* TY_(FindHTML)( TidyDocImpl* doc ); TY_PRIVATE Node* TY_(FindHTML)( TidyDocImpl* doc );
TY_PRIVATE Node* TY_(FindHEAD)( TidyDocImpl* doc ); TY_PRIVATE Node* TY_(FindHEAD)( TidyDocImpl* doc );
@ -468,10 +519,16 @@ TY_PRIVATE Node* TY_(FindTITLE)(TidyDocImpl* doc);
TY_PRIVATE Node* TY_(FindBody)( TidyDocImpl* doc ); TY_PRIVATE Node* TY_(FindBody)( TidyDocImpl* doc );
TY_PRIVATE Node* TY_(FindXmlDecl)(TidyDocImpl* doc); TY_PRIVATE Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
/* Returns containing block element, if any */
/**
* Returns containing block element, if any
*/
TY_PRIVATE Node* TY_(FindContainer)( Node* node ); TY_PRIVATE Node* TY_(FindContainer)( Node* node );
/* add meta element for Tidy */
/**
* Add meta element for Tidy.
*/
TY_PRIVATE Bool TY_(AddGenerator)( TidyDocImpl* doc ); TY_PRIVATE Bool TY_(AddGenerator)( TidyDocImpl* doc );
TY_PRIVATE uint TY_(ApparentVersion)( TidyDocImpl* doc ); TY_PRIVATE uint TY_(ApparentVersion)( TidyDocImpl* doc );
@ -485,118 +542,209 @@ TY_PRIVATE Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc );
TY_PRIVATE Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc ); TY_PRIVATE Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc );
/* fixup doctype if missing */ /**
* Fixup doctype if missing.
*/
TY_PRIVATE Bool TY_(FixDocType)( TidyDocImpl* doc ); TY_PRIVATE Bool TY_(FixDocType)( TidyDocImpl* doc );
/* ensure XML document starts with <?xml version="1.0"?> */
/* add encoding attribute if not using ASCII or UTF-8 output */ /**
* Ensure XML document starts with <?xml version="1.0"?>, and
* add encoding attribute if not using ASCII or UTF-8 output.
*/
TY_PRIVATE Bool TY_(FixXmlDecl)( TidyDocImpl* doc ); TY_PRIVATE Bool TY_(FixXmlDecl)( TidyDocImpl* doc );
TY_PRIVATE Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id); TY_PRIVATE Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id);
TY_PRIVATE void TY_(UngetToken)( TidyDocImpl* doc ); TY_PRIVATE void TY_(UngetToken)( TidyDocImpl* doc );
/*
modes for GetToken()
MixedContent -- for elements which don't accept PCDATA
Preformatted -- white space preserved as is
IgnoreMarkup -- for CDATA elements such as script, style
*/
typedef enum
{
IgnoreWhitespace,
MixedContent,
Preformatted,
IgnoreMarkup,
OtherNamespace,
CdataContent
} GetTokenMode;
TY_PRIVATE Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode ); TY_PRIVATE Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode );
TY_PRIVATE void TY_(InitMap)(void); TY_PRIVATE void TY_(InitMap)(void);
/* create a new attribute */ /**
* Create a new attribute.
*/
TY_PRIVATE AttVal* TY_(NewAttribute)( TidyDocImpl* doc ); TY_PRIVATE AttVal* TY_(NewAttribute)( TidyDocImpl* doc );
/* create a new attribute with given name and value */
/**
* Create a new attribute with given name and value.
*/
TY_PRIVATE AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value, TY_PRIVATE AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
int delim ); int delim );
/* insert attribute at the end of attribute list of a node */
/**
* Insert attribute at the end of attribute list of a node.
*/
TY_PRIVATE void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av ); TY_PRIVATE void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av );
/* insert attribute at the start of attribute list of a node */ /**
* Insert attribute at the start of attribute list of a node.
*/
TY_PRIVATE void TY_(InsertAttributeAtStart)( Node *node, AttVal *av ); TY_PRIVATE void TY_(InsertAttributeAtStart)( Node *node, AttVal *av );
/*************************************
In-line Stack functions /** @}
*************************************/ * @name Inline Stack Functions
* @{
*/
/* duplicate attributes */ /**
* Duplicate attributes.
*/
TY_PRIVATE AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs ); TY_PRIVATE AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs );
/*
push a copy of an inline node onto stack
but don't push if implicit or OBJECT or APPLET
(implicit tags are ones generated from the istack)
One issue arises with pushing inlines when /**
the tag is already pushed. For instance: * Push a copy of an inline node onto stack, but don't push if
* implicit or OBJECT or APPLET (implicit tags are ones generated
<p><em>text * from the istack).
<p><em>more text *
* One issue arises with pushing inlines when the tag is already pushed.
Shouldn't be mapped to * For instance:
* ~~~
<p><em>text</em></p> * <p><em>text
<p><em><em>more text</em></em> * <p><em>more text
*/ * ~~~
* Shouldn't be mapped to
* ~~~
* <p><em>text</em></p>
* <p><em><em>more text</em></em>
* ~~~
*/
TY_PRIVATE void TY_(PushInline)( TidyDocImpl* doc, Node* node ); TY_PRIVATE void TY_(PushInline)( TidyDocImpl* doc, Node* node );
/* pop inline stack */
/**
* Pop inline stack.
*/
TY_PRIVATE void TY_(PopInline)( TidyDocImpl* doc, Node* node ); TY_PRIVATE void TY_(PopInline)( TidyDocImpl* doc, Node* node );
TY_PRIVATE Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node ); TY_PRIVATE Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node );
TY_PRIVATE Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node ); TY_PRIVATE Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node );
/*
This has the effect of inserting "missing" inline
elements around the contents of blocklevel elements
such as P, TD, TH, DIV, PRE etc. This procedure is
called at the start of ParseBlock. when the inline
stack is not empty, as will be the case in:
<i><h1>italic heading</h1></i> /**
* This has the effect of inserting "missing" inline elements around the
which is then treated as equivalent to * contents of blocklevel elements such as P, TD, TH, DIV, PRE etc. This
* procedure is called at the start of `ParseBlock`, when the inline
<h1><i>italic heading</i></h1> * stack is not empty, as will be the case in:
* ~~~
This is implemented by setting the lexer into a mode * <i><h1>italic heading</h1></i>
where it gets tokens from the inline stack rather than * ~~~
from the input stream. * which is then treated as equivalent to
*/ * ~~~
* <h1><i>italic heading</i></h1>
* ~~~
* This is implemented by setting the lexer into a mode where it gets
* tokens from the inline stack rather than from the input stream.
*/
TY_PRIVATE int TY_(InlineDup)( TidyDocImpl* doc, Node *node ); TY_PRIVATE int TY_(InlineDup)( TidyDocImpl* doc, Node *node );
/*
defer duplicates when entering a table or other /**
element where the inlines shouldn't be duplicated * Defer duplicates when entering a table or other
*/ * element where the inlines shouldn't be duplicated.
*/
TY_PRIVATE void TY_(DeferDup)( TidyDocImpl* doc ); TY_PRIVATE void TY_(DeferDup)( TidyDocImpl* doc );
TY_PRIVATE Node* TY_(InsertedToken)( TidyDocImpl* doc ); TY_PRIVATE Node* TY_(InsertedToken)( TidyDocImpl* doc );
/* stack manipulation for inline elements */ /**
* Stack manipulation for inline elements
*/
TY_PRIVATE Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node ); TY_PRIVATE Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node );
TY_PRIVATE Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element ); TY_PRIVATE Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element );
/** @}
* @name Generic stack of nodes.
* @{
*/
/**
 * This typedef represents a stack of addresses to nodes. Tidy uses these to
 * try to limit recursion by pushing nodes to a stack when possible instead
 * of recursing.
 */
typedef struct _Stack {
int top; /**< Current top position; -1 denotes an empty stack. */
unsigned capacity; /**< Current capacity. Can be expanded. */
Node **firstNode; /**< A pointer to the first pointer to a Node in an array of node addresses. */
TidyAllocator* allocator; /**< Tidy's allocator, used at instantiation and expanding. */
} Stack;
/**
 * Create a new stack with a given starting capacity. If memory allocation
 * fails, then the allocator will panic the program automatically.
 * @param doc The Tidy document on whose behalf the stack is created.
 *        NOTE(review): presumably the source of the allocator stored in
 *        the stack -- confirm against the implementation.
 * @param capacity The starting capacity of the stack.
 * @returns Returns a pointer to the new stack; release it with
 *          TY_(freeStack) when done.
 */
TY_PRIVATE Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity);
/**
 * Increase the stack size. This will be called automatically when the
 * current stack is full. If memory allocation fails, then the allocator
 * will panic the program automatically.
 * @param stack The stack to grow.
 */
TY_PRIVATE void TY_(growStack)(Stack *stack);
/**
 * Stack is full when top is equal to the last index.
 * @param stack The stack to test.
 * @returns Returns whether the stack is full.
 */
TY_PRIVATE Bool TY_(stackFull)(Stack *stack);
/**
 * Stack is empty when top is equal to -1
 * @param stack The stack to test.
 * @returns Returns whether the stack is empty.
 */
TY_PRIVATE Bool TY_(stackEmpty)(Stack *stack);
/**
 * Push an item to the stack.
 * @param stack The stack to push to.
 * @param node The node address to push.
 *        NOTE(review): callers appear to push NULL as well (a node's `next`
 *        may be NULL) -- confirm that this is supported.
 */
TY_PRIVATE void TY_(push)(Stack *stack, Node *node);
/**
 * Pop an item from the stack.
 * @param stack The stack to pop from.
 * @returns Returns the popped node address.
 *          NOTE(review): the result for an empty stack is not stated here;
 *          presumably NULL -- confirm against the implementation.
 */
TY_PRIVATE Node* TY_(pop)(Stack *stack);
/**
 * Peek at the stack.
 * @param stack The stack to peek at.
 * @returns Returns a node address from the stack.
 *          NOTE(review): presumably the top item, left in place -- confirm.
 */
TY_PRIVATE Node* TY_(peek)(Stack *stack);
/**
 * Frees the stack when done.
 * @param stack The stack to free.
 */
TY_PRIVATE void TY_(freeStack)(Stack *stack);
/** @}
*/
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
/** @} end parser_h group */
/** @} end internal_api group */
#endif /* __LEXER_H__ */ #endif /* __LEXER_H__ */

File diff suppressed because it is too large Load diff

View file

@ -41,6 +41,74 @@
******************************************************************************/ ******************************************************************************/
/**
 * The parsers keep track of their states with the states defined here, and
 * use these symbols when pushing to the stack so that they can later recreate
 * their environments when re-entered.
 */
typedef enum {
/* Universal states. */
STATE_INITIAL, /**< This is the initial state for every parser. */
STATE_COMPLETE, /**< Complete! */
/* NOTE(review): the next two states are undocumented; presumably a
   "parse a tag" step and its resume point, by analogy with the *_DONE
   states below -- confirm against the parsers that use them. */
STATE_PARSE_TAG,
STATE_PARSE_TAG_DONE,
/* ParseHTML states. */
STATE_PRE_HEAD, /**< In this state, we've not detected head yet. */
STATE_PRE_BODY, /**< In this state, we'll consider frames vs. body. */
STATE_PARSE_BODY, /**< In this state, we can parse the body. */
STATE_PARSE_HEAD, /**< In this state, we will setup head for parsing. */
STATE_PARSE_HEAD_DONE, /**< Resume here after parsing head. */
STATE_PARSE_NOFRAMES, /**< In this state, we can parse noframes content. */
STATE_PARSE_NOFRAMES_DONE, /**< In this state, we can restore more state. */
STATE_PARSE_FRAMESET, /**< In this state, we will parse frameset content. */
STATE_PARSE_FRAMESET_DONE, /**< We need to cleanup some things after parsing frameset. */
} parserState;
/**
 * This typedef represents the state of a parser when it enters and exits.
 * When the parser needs to finish work on the way back up the stack, it will
 * push one of these records to the stack, and it will pop a record from the
 * stack upon re-entry.
 */
typedef struct _TidyParserMemory
{
Parser *identity; /**< Which parser pushed this record? (Pointer to the parser function.) */
Node *original_node; /**< Originally provided node at entry. */
Node *reentry_node; /**< A node a parser might want to save. */
GetTokenMode reentry_mode; /**< The mode to use for the next node. */
parserState reentry_state; /**< State to set during re-entry. */
GetTokenMode mode; /**< The caller will peek at this value to get the correct mode. */
} TidyParserMemory;
/**
 * This typedef represents a stack of TidyParserMemory records (saved parser
 * states). The Tidy document has its own instance of this.
 */
typedef struct _TidyParserStack
{
TidyParserMemory* content; /**< Pointer to the state record storage. */
TidyAllocator* allocator; /**< The allocator used for creating. */
uint size; /**< Current size of the stack. NOTE(review): presumably the allocated capacity in records rather than the number in use -- confirm. */
int top; /**< Top of the stack. */
} TidyParserStack;
/**
* Allocates and initializes the parser's stack. TidyCreate will perform
* this automatically.
*/
void TY_(InitParserStack)( TidyDocImpl* doc );
/**
* Frees the parser's stack when done. TidyRelease will perform this
* automatically.
*/
void TY_(FreeParserStack)( TidyDocImpl* doc );
/** /**
* Is used to perform a node integrity check recursively after parsing * Is used to perform a node integrity check recursively after parsing
* an HTML or XML document. * an HTML or XML document.
@ -96,7 +164,7 @@ TY_PRIVATE Node *TY_(RemoveNode)(Node *node);
/** /**
* Remove node from markup tree and discard it. * Remove node from markup tree and discard it.
* @param doc The Tidy document from which to discarb the node. * @param doc The Tidy document from which to discard the node.
* @param element The node to discard. * @param element The node to discard.
* @returns Returns the next node. * @returns Returns the next node.
*/ */
@ -202,4 +270,3 @@ TY_PRIVATE void TY_(ParseXMLDocument)( TidyDocImpl* doc );
/** @} end internal_api group */ /** @} end internal_api group */
#endif /* __PARSER_H__ */ #endif /* __PARSER_H__ */

View file

@ -61,8 +61,13 @@ typedef enum
/** This typedef describes a function to be used to parse HTML of a Tidy tag. /** This typedef describes a function to be used to parse HTML of a Tidy tag.
** @param doc The Tidy document.
** @param node The node being parsed.
** @param mode The GetTokenMode to be used for parsing the node contents.
** @param popStack A flag indicating that we are re-entering this parser, and
** it should restore a state from the stack.
*/ */
typedef void (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode ); typedef Node* (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode, Bool popStack );
/** This typedef describes a function to be used to check the attributes /** This typedef describes a function to be used to check the attributes

View file

@ -16,6 +16,7 @@
#include "pprint.h" #include "pprint.h"
#include "access.h" #include "access.h"
#include "message.h" #include "message.h"
#include "parser.h"
#ifndef MAX #ifndef MAX
#define MAX(a,b) (((a) > (b))?(a):(b)) #define MAX(a,b) (((a) > (b))?(a):(b))
@ -54,6 +55,7 @@ struct _TidyDocImpl
StreamIn* docIn; StreamIn* docIn;
StreamOut* docOut; StreamOut* docOut;
StreamOut* errout; StreamOut* errout;
TidyReportFilter reportFilter; TidyReportFilter reportFilter;
TidyReportCallback reportCallback; TidyReportCallback reportCallback;
TidyMessageCallback messageCallback; TidyMessageCallback messageCallback;
@ -62,6 +64,8 @@ struct _TidyDocImpl
TidyConfigChangeCallback pConfigChangeCallback; TidyConfigChangeCallback pConfigChangeCallback;
TidyPPProgress progressCallback; TidyPPProgress progressCallback;
TidyParserStack stack;
/* Parse + Repair Results */ /* Parse + Repair Results */
uint optionErrors; uint optionErrors;
uint errors; uint errors;

View file

@ -112,6 +112,7 @@ TidyDocImpl* tidyDocCreate( TidyAllocator *allocator )
TY_(InitAttrs)( doc ); TY_(InitAttrs)( doc );
TY_(InitConfig)( doc ); TY_(InitConfig)( doc );
TY_(InitPrintBuf)( doc ); TY_(InitPrintBuf)( doc );
TY_(InitParserStack)( doc );
/* Set the locale for tidy's output. This both configures /* Set the locale for tidy's output. This both configures
** LibTidy to use the environment's locale as well as the ** LibTidy to use the environment's locale as well as the
@ -172,6 +173,7 @@ void tidyDocRelease( TidyDocImpl* doc )
* to determine which hash is to be used, so free it last. * to determine which hash is to be used, so free it last.
\*/ \*/
TY_(FreeLexer)( doc ); TY_(FreeLexer)( doc );
TY_(FreeParserStack)( doc );
TidyDocFree( doc, doc ); TidyDocFree( doc, doc );
} }
} }