Merge pull request #981 from htacg/iterate

Documentation and Recursion
2021-07-29 06:22:48 -04:00 · 2021-07-29 06:22:48 -04:00 · db847e6e1c
parent ab6b76dae9 e56716f154
commit db847e6e1c
9 changed files with 3698 additions and 1509 deletions
--- a/src/clean.c
+++ b/src/clean.c
@ -1585,11 +1585,16 @@ void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
 */
 void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
 {
+    Stack *stack = TY_(newStack)(doc, 16);
+    Node *next;
+    
    tmbchar indent_buf[ 32 ];
    uint indent;

    while (node)
    {
+        next = node->next;
+        
        if ( nodeIsBLOCKQUOTE(node) && node->implicit )
        {
            indent = 1;
@ -1602,19 +1607,27 @@ void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
                StripOnlyChild( doc, node );
            }

-            if (node->content)
-                TY_(BQ2Div)( doc, node->content );
-
            TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
                             2*indent);

            RenameElem( doc, node, TidyTag_DIV );
            TY_(AddStyleProperty)(doc, node, indent_buf );
+
+            if (node->content)
+            {
+                TY_(push)(stack, next);
+                node = node->content;
+                continue;
+            }
        }
        else if (node->content)
-            TY_(BQ2Div)( doc, node->content );
+        {
+            TY_(push)(stack, next);
+            node = node->content;
+            continue;
+        }

-        node = node->next;
+        node = next ? next : TY_(pop)(stack);
    }
 }

@ -2736,11 +2749,13 @@ void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
 */
 static void StyleToHead(TidyDocImpl* doc, Node *head, Node *node, Bool fix, int indent)
 {
+    Stack *stack = TY_(newStack)(doc, 16);
    Node *next;
+    
    while (node)
    {
-		next = node->next;	/* get 'next' now , in case the node is moved */
-		/* dbg_show_node(doc, node, 0, indent); */
+        next = node->next;
+        
        if (nodeIsSTYLE(node))
        {
            if (fix)
@ -2756,9 +2771,19 @@ static void StyleToHead(TidyDocImpl* doc, Node *head, Node *node, Bool fix, int
        }
        else if (node->content)
        {
-			StyleToHead(doc, head, node->content, fix, indent + 1);
+            TY_(push)(stack, next);
+            node = node->content;
+            indent++;
+            continue;
+        }
+        
+        if (next)
+            node = next;
+        else
+        {
+            node = TY_(pop)(stack);
+            indent--;
        }
-		node = next;	/* process the 'next', if any */
    }
 }

--- a/src/lexer.c
+++ b/src/lexer.c
@ -877,15 +877,6 @@ static tmbchar LastChar( tmbstr str )
    return 0;
 }

-/*
-   node->type is one of these:
-
-    #define TextNode    1
-    #define StartTag    2
-    #define EndTag      3
-    #define StartEndTag 4
-*/
-
 Lexer* TY_(NewLexer)( TidyDocImpl* doc )
 {
    Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) );
@ -1545,13 +1536,7 @@ void TY_(FreeNode)( TidyDocImpl* doc, Node *node )
        }
    }
 #endif
-    /* this is no good ;=((
-    if (node && doc && doc->lexer) {
-        if (node == doc->lexer->token) {
-            doc->lexer->token = NULL; // TY_(NewNode)( doc->lexer->allocator, doc->lexer );
-        }
-    }
-      ----------------- */
+
    while ( node )
    {
        Node* next = node->next;
@ -4462,11 +4447,102 @@ static Node *ParseDocTypeDecl(TidyDocImpl* doc)
    return NULL;
 }

-/*
- * local variables:
- * mode: c
- * indent-tabs-mode: nil
- * c-basic-offset: 4
- * eval: (c-set-offset 'substatement-open 0)
- * end:
+
+/****************************************************************************//*
+ ** MARK: - Node Stack
+ ***************************************************************************/
+
+
+/**
+ * Create a new stack with a given starting capacity. If memory allocation
+ * fails, then the allocator will panic the program automatically.
+ *
+ * @param doc       Document whose allocator backs both the Stack record and
+ *                  its node array.
+ * @param capacity  Number of node slots to reserve initially.
+ * @return          A newly allocated, empty stack (top == -1).
 */
+Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity)
+{
+    Stack *stack = (Stack *)TidyAlloc(doc->allocator, sizeof(Stack));
+    stack->top = -1;
+    stack->capacity = capacity;
+    /* The array stores Node pointers, so each slot is sizeof(Node*) bytes. */
+    stack->firstNode = (Node **)TidyAlloc(doc->allocator, stack->capacity * sizeof(Node*));
+    stack->allocator = doc->allocator;
+    return stack;
+}
+ 
+
+/**
+ *  Increase the stack size. This will be called automatically when the
+ *  current stack is full. If memory allocation fails, then the allocator
+ *  will panic the program automatically.
+ *
+ *  The capacity is doubled (a zero-capacity stack grows to one slot), the
+ *  entries in [0, top] are copied to the new array, and the old array is
+ *  released through the stack's allocator.
+ */
+void TY_(growStack)(Stack *stack)
+{
+    /* Doubling zero would never make room, so start from a single slot. */
+    uint new_capacity = stack->capacity ? stack->capacity * 2 : 1;
+
+    /* TidyAlloc takes a size in bytes: scale the slot count by the slot size. */
+    Node **firstNode = (Node **)TidyAlloc(stack->allocator, new_capacity * sizeof(Node*));
+
+    /* Nothing to copy from an empty stack (top == -1). */
+    if (stack->top >= 0)
+        memcpy( firstNode, stack->firstNode, sizeof(Node*) * (stack->top + 1) );
+    TidyFree(stack->allocator, stack->firstNode);
+
+    stack->firstNode = firstNode;
+    stack->capacity = new_capacity;
+}
+
+
+/**
+ * Stack is full when top is equal to the last index.
+ *
+ * `top` is a signed int and `capacity` is unsigned, so the comparison is
+ * carried out in unsigned arithmetic; an empty stack (top == -1) whose
+ * capacity is 0 therefore also reports full.
+ */
+Bool TY_(stackFull)(Stack *stack)
+{
+    return stack->top == stack->capacity - 1;
+}
+
+
+/**
+ * Stack is empty when top is equal to -1, the value newStack() and
+ * freeStack() assign to it.
+ */
+Bool TY_(stackEmpty)(Stack *stack)
+{
+    return stack->top == -1;
+}
+ 
+
+/**
+ * Push an item to the stack.
+ *
+ * The array is grown first if it is already full. A NULL node is not
+ * stored, so pushing NULL leaves the stack contents unchanged (although
+ * the array may still have been grown).
+ */
+void TY_(push)(Stack *stack, Node *node)
+{
+    if (TY_(stackFull)(stack))
+        TY_(growStack)(stack);
+    
+    if (node)
+        stack->firstNode[++stack->top] = node;
+}
+
+
+/**
+ * Pop an item from the stack.
+ *
+ * Removes and returns the node at the top, or returns NULL when the stack
+ * is empty.
+ */
+Node* TY_(pop)(Stack *stack)
+{
+    return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top--];
+}
+
+
+/**
+ * Peek at the stack: return the node at the top without removing it, or
+ * NULL when the stack is empty.
+ */
+FUNC_UNUSED Node* TY_(peek)(Stack *stack)
+{
+    /* No decrement here: peeking must leave `top` where it is. */
+    return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top];
+}
+
+/**
+ *  Frees the stack when done.
+ *
+ *  Releases the node array through the stack's allocator and resets every
+ *  field. The Stack structure itself is not released here.
+ *  NOTE(review): newStack() allocates the structure with TidyAlloc; confirm
+ *  who is expected to free it, otherwise it leaks.
+ */
+void TY_(freeStack)(Stack *stack)
+{
+    TidyFree( stack->allocator, stack->firstNode );
+    stack->top = -1;
+    stack->capacity = 0;
+    stack->firstNode = NULL;
+    stack->allocator = NULL;
+}
--- a/src/lexer.h
+++ b/src/lexer.h
@ -1,33 +1,46 @@
 #ifndef __LEXER_H__
 #define __LEXER_H__

-/* lexer.h -- Lexer for html parser

-   (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
-   See tidy.h for the copyright notice.
-
-  Given an input source, it returns a sequence of tokens.
-
-     GetToken(source) gets the next token
-     UngetToken(source) provides one level undo
-
-  The tags include an attribute list:
-
-    - linked list of attribute/value nodes
-    - each node has 2 NULL-terminated strings.
-    - entities are replaced in attribute values
-
-  white space is compacted if not in preformatted mode
-  If not in preformatted mode then leading white space
-  is discarded and subsequent white space sequences
-  compacted to single space characters.
-
-  If XmlTags is no then Tag names are folded to upper
-  case and attribute names to lower case.
-
- Not yet done:
-    -   Doctype subset and marked sections
-*/
+/**************************************************************************//**
+ * @file
+ * Lexer for HTML and XML Parsers.
+ *
+ *   Given an input source, it returns a sequence of tokens.
+ *
+ *      GetToken(source) gets the next token
+ *      UngetToken(source) provides one level undo
+ *
+ *   The tags include an attribute list:
+ *
+ *     - linked list of attribute/value nodes
+ *     - each node has 2 NULL-terminated strings.
+ *     - entities are replaced in attribute values
+ *
+ *   white space is compacted if not in preformatted mode
+ *   If not in preformatted mode then leading white space
+ *   is discarded and subsequent white space sequences
+ *   compacted to single space characters.
+ *
+ *   If XmlTags is no then Tag names are folded to upper
+ *   case and attribute names to lower case.
+ *
+ *  Not yet done:
+ *     - Doctype subset and marked sections
+ *
+ * @author  HTACG, et al (consult git log)
+ *
+ * @copyright
+ *     (c) 1998-2021 (W3C) MIT, ERCIM, Keio University, and HTACG.
+ *     See tidy.h for the copyright notice.
+ * @par
+ *     All Rights Reserved.
+ * @par
+ *     See `tidy.h` for the complete license.
+ *
+ * @date Additional updates: consult git log
+ *
+ ******************************************************************************/

 #ifdef __cplusplus
 extern "C" {
@ -35,8 +48,23 @@ extern "C" {

 #include "forward.h"

-/* lexer character types
-*/
+/** @addtogroup internal_api */
+/** @{ */
+
+
+/***************************************************************************//**
+ ** @defgroup lexer_h HTML and XML Lexing
+ **
+ ** These functions and structures form the internal API for document
+ ** lexing.
+ **
+ ** @{
+ ******************************************************************************/
+
+
+/**
+ *  Lexer character types.
+ */
 #define digit       1u
 #define letter      2u
 #define namechar    4u
@ -47,8 +75,9 @@ extern "C" {
 #define digithex    128u


-/* node->type is one of these values
-*/
+/**
+ *  node->type is one of these values
+ */
 typedef enum
 {
  RootNode,
@ -68,9 +97,9 @@ typedef enum
 } NodeType;


-
-/* lexer GetToken states
-*/
+/**
+ *  Lexer GetToken() states.
+ */
 typedef enum
 {
  LEX_CONTENT,
@ -88,7 +117,10 @@ typedef enum
  LEX_XMLDECL
 } LexerState;

-/* ParseDocTypeDecl state constants */
+
+/**
+ *  ParseDocTypeDecl state constants.
+ */
 typedef enum
 {
  DT_INTERMEDIATE,
@ -98,67 +130,44 @@ typedef enum
  DT_INTSUBSET
 } ParseDocTypeDeclState;

-/* content model shortcut encoding

-   Descriptions are tentative.
-*/
+/**
+ *  Content model shortcut encoding.
+ *  Descriptions are tentative.
+ */
 #define CM_UNKNOWN      0
-/* Elements with no content. Map to HTML specification. */
-#define CM_EMPTY        (1 << 0)
-/* Elements that appear outside of "BODY". */
-#define CM_HTML         (1 << 1)
-/* Elements that can appear within HEAD. */
-#define CM_HEAD         (1 << 2)
-/* HTML "block" elements. */
-#define CM_BLOCK        (1 << 3)
-/* HTML "inline" elements. */
-#define CM_INLINE       (1 << 4)
-/* Elements that mark list item ("LI"). */
-#define CM_LIST         (1 << 5)
-/* Elements that mark definition list item ("DL", "DT"). */
-#define CM_DEFLIST      (1 << 6)
-/* Elements that can appear inside TABLE. */
-#define CM_TABLE        (1 << 7)
-/* Used for "THEAD", "TFOOT" or "TBODY". */
-#define CM_ROWGRP       (1 << 8)
-/* Used for "TD", "TH" */
-#define CM_ROW          (1 << 9)
-/* Elements whose content must be protected against white space movement.
-   Includes some elements that can found in forms. */
-#define CM_FIELD        (1 << 10)
-/* Used to avoid propagating inline emphasis inside some elements
-   such as OBJECT or APPLET. */
-#define CM_OBJECT       (1 << 11)
-/* Elements that allows "PARAM". */
-#define CM_PARAM        (1 << 12)
-/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
-#define CM_FRAMES       (1 << 13)
-/* Heading elements (h1, h2, ...). */
-#define CM_HEADING      (1 << 14)
-/* Elements with an optional end tag. */
-#define CM_OPT          (1 << 15)
-/* Elements that use "align" attribute for vertical position. */
-#define CM_IMG          (1 << 16)
-/* Elements with inline and block model. Used to avoid calling InlineDup. */
-#define CM_MIXED        (1 << 17)
-/* Elements whose content needs to be indented only if containing one 
-   CM_BLOCK element. */
-#define CM_NO_INDENT    (1 << 18)
-/* Elements that are obsolete (such as "dir", "menu"). */
-#define CM_OBSOLETE     (1 << 19)
-/* User defined elements. Used to determine how attributes without value
-   should be printed. */
-#define CM_NEW          (1 << 20)
-/* Elements that cannot be omitted. */
-#define CM_OMITST       (1 << 21)
+#define CM_EMPTY        (1 << 0)   /**< Elements with no content. Map to HTML specification. */
+#define CM_HTML         (1 << 1)   /**< Elements that appear outside of "BODY". */
+#define CM_HEAD         (1 << 2)   /**< Elements that can appear within HEAD. */
+#define CM_BLOCK        (1 << 3)   /**< HTML "block" elements. */
+#define CM_INLINE       (1 << 4)   /**< HTML "inline" elements. */
+#define CM_LIST         (1 << 5)   /**< Elements that mark list item ("LI"). */
+#define CM_DEFLIST      (1 << 6)   /**< Elements that mark definition list item ("DL", "DT"). */
+#define CM_TABLE        (1 << 7)   /**< Elements that can appear inside TABLE. */
+#define CM_ROWGRP       (1 << 8)   /**< Used for "THEAD", "TFOOT" or "TBODY". */
+#define CM_ROW          (1 << 9)   /**< Used for "TD", "TH" */
+#define CM_FIELD        (1 << 10)  /**< Elements whose content must be protected against white space movement. Includes some elements that can be found in forms. */
+#define CM_OBJECT       (1 << 11)  /**< Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET. */
+#define CM_PARAM        (1 << 12)  /**< Elements that allows "PARAM". */
+#define CM_FRAMES       (1 << 13)  /**< "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
+#define CM_HEADING      (1 << 14)  /**< Heading elements (h1, h2, ...). */
+#define CM_OPT          (1 << 15)  /**< Elements with an optional end tag. */
+#define CM_IMG          (1 << 16)  /**< Elements that use "align" attribute for vertical position. */
+#define CM_MIXED        (1 << 17)  /**< Elements with inline and block model. Used to avoid calling InlineDup. */
+#define CM_NO_INDENT    (1 << 18)  /**< Elements whose content needs to be indented only if containing one CM_BLOCK element. */
+#define CM_OBSOLETE     (1 << 19)  /**< Elements that are obsolete (such as "dir", "menu"). */
+#define CM_NEW          (1 << 20)  /**< User defined elements. Used to determine how attributes without value should be printed. */
+#define CM_OMITST       (1 << 21)   /**< Elements that cannot be omitted. */

-/* If the document uses just HTML 2.0 tags and attributes described
-** it as HTML 2.0 Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
-** If there are proprietary tags and attributes then describe it as
-** HTML Proprietary. If it includes the xml-lang or xmlns attributes
-** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
-** flavors of Voyager (strict, loose or frameset).
-*/
+
+/**
+ *  If the document uses just HTML 2.0 tags and attributes, describe
+ *  it as HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
+ *  If there are proprietary tags and attributes then describe it as
+ *  HTML Proprietary. If it includes the xml-lang or xmlns attributes
+ *  but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
+ *  flavors of Voyager (strict, loose or frameset).
+ */

 /* unknown */
 #define xxxx                   0u
@ -220,8 +229,10 @@ typedef enum
 /* all proprietary types */
 #define VERS_PROPRIETARY   (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)

-/* Linked list of class names and styles
-*/
+
+/**
+ *  Linked list of class names and styles
+ */
 struct _Style;
 typedef struct _Style TagStyle;

@ -234,8 +245,9 @@ struct _Style
 };


-/* Linked list of style properties
-*/
+/**
+ *  Linked list of style properties
+ */
 struct _StyleProp;
 typedef struct _StyleProp StyleProp;

@ -247,11 +259,9 @@ struct _StyleProp
 };


-
-
-/* Attribute/Value linked list node
-*/
-
+/**
+ *  Attribute/Value linked list node
+ */
 struct _AttVal
 {
    AttVal*           next;
@ -264,93 +274,89 @@ struct _AttVal
 };


-
-/*
-  Mosaic handles inlines via a separate stack from other elements
-  We duplicate this to recover from inline markup errors such as:
-
-     <i>italic text
-     <p>more italic text</b> normal text
-
-  which for compatibility with Mosaic is mapped to:
-
-     <i>italic text</i>
-     <p><i>more italic text</i> normal text
-
-  Note that any inline end tag pop's the effect of the current
-  inline start tag, so that </b> pop's <i> in the above example.
+/**
+ *  Mosaic handles inlines via a separate stack from other elements
+ *  We duplicate this to recover from inline markup errors such as:
+ *     ~~~
+ *     <i>italic text
+ *     <p>more italic text</b> normal text
+ *     ~~~
+ *  which for compatibility with Mosaic is mapped to:
+ *     ~~~
+ *     <i>italic text</i>
+ *     <p><i>more italic text</i> normal text
+ *     ~~~
+ *  Note that any inline end tag pops the effect of the current
+ *  inline start tag, so that `</b>` pops `<i>` in the above example.
 */
 struct _IStack
 {
    IStack*     next;
-    const Dict* tag;        /* tag's dictionary definition */
-    tmbstr      element;    /* name (NULL for text nodes) */
+    const Dict* tag;        /**< tag's dictionary definition */
+    tmbstr      element;    /**< name (NULL for text nodes) */
    AttVal*     attributes;
 };


-/* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl,
-** etc. etc.
-*/
-
+/**
+ *  HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc.
+ */
 struct _Node
 {
-    Node*       parent;         /* tree structure */
+    Node*       parent;         /**< tree structure */
    Node*       prev;
    Node*       next;
    Node*       content;
    Node*       last;

    AttVal*     attributes;
-    const Dict* was;            /* old tag when it was changed */
-    const Dict* tag;            /* tag's dictionary definition */
+    const Dict* was;            /**< old tag when it was changed */
+    const Dict* tag;            /**< tag's dictionary definition */

-    tmbstr      element;        /* name (NULL for text nodes) */
+    tmbstr      element;        /**< name (NULL for text nodes) */

-    uint        start;          /* start of span onto text array */
-    uint        end;            /* end of span onto text array */
-    NodeType    type;           /* TextNode, StartTag, EndTag etc. */
+    uint        start;          /**< start of span onto text array */
+    uint        end;            /**< end of span onto text array */
+    NodeType    type;           /**< TextNode, StartTag, EndTag etc. */

-    uint        line;           /* current line of document */
-    uint        column;         /* current column of document */
+    uint        line;           /**< current line of document */
+    uint        column;         /**< current column of document */

-    Bool        closed;         /* true if closed by explicit end tag */
-    Bool        implicit;       /* true if inferred */
-    Bool        linebreak;      /* true if followed by a line break */
+    Bool        closed;         /**< true if closed by explicit end tag */
+    Bool        implicit;       /**< true if inferred */
+    Bool        linebreak;      /**< true if followed by a line break */
 };


-/*
-  The following are private to the lexer
-  Use NewLexer() to create a lexer, and
-  FreeLexer() to free it.
-*/
-
+/**
+ *  The following are private to the lexer.
+ *  Use `NewLexer()` to create a lexer, and `FreeLexer()` to free it.
+ */
 struct _Lexer
 {
-    uint lines;             /* lines seen */
-    uint columns;           /* at start of current token */
-    Bool waswhite;          /* used to collapse contiguous white space */
-    Bool pushed;            /* true after token has been pushed back */
-    Bool insertspace;       /* when space is moved after end tag */
-    Bool excludeBlocks;     /* Netscape compatibility */
-    Bool exiled;            /* true if moved out of table */
-    Bool isvoyager;         /* true if xmlns attribute on html element */
-    uint versions;          /* bit vector of HTML versions */
-    uint doctype;           /* version as given by doctype (if any) */
-    uint versionEmitted;    /* version of doctype emitted */
-    Bool bad_doctype;       /* e.g. if html or PUBLIC is missing */
-    uint txtstart;          /* start of current node */
-    uint txtend;            /* end of current node */
-    LexerState state;       /* state of lexer's finite state machine */
+    uint lines;                /**< lines seen */
+    uint columns;              /**< at start of current token */
+    Bool waswhite;             /**< used to collapse contiguous white space */
+    Bool pushed;               /**< true after token has been pushed back */
+    Bool insertspace;          /**< when space is moved after end tag */
+    Bool excludeBlocks;        /**< Netscape compatibility */
+    Bool exiled;               /**< true if moved out of table */
+    Bool isvoyager;            /**< true if xmlns attribute on html element (i.e., "Voyager" was the W3C codename for XHTML). */
+    uint versions;             /**< bit vector of HTML versions */
+    uint doctype;              /**< version as given by doctype (if any) */
+    uint versionEmitted;       /**< version of doctype emitted */
+    Bool bad_doctype;          /**< e.g. if html or PUBLIC is missing */
+    uint txtstart;             /**< start of current node */
+    uint txtend;               /**< end of current node */
+    LexerState state;          /**< state of lexer's finite state machine */

-    Node* token;            /* last token returned by GetToken() */
-    Node* itoken;           /* last duplicate inline returned by GetToken() */
-    Node* root;             /* remember root node of the document */
-    Node* parent;           /* remember parent node for CDATA elements */
+    Node* token;               /**< last token returned by GetToken() */
+    Node* itoken;              /**< last duplicate inline returned by GetToken() */
+    Node* root;                /**< remember root node of the document */
+    Node* parent;              /**< remember parent node for CDATA elements */

-    Bool seenEndBody;       /* true if a </body> tag has been encountered */
-    Bool seenEndHtml;       /* true if a </html> tag has been encountered */
+    Bool seenEndBody;          /**< true if a `</body>` tag has been encountered */
+    Bool seenEndHtml;          /**< true if a `</html>` tag has been encountered */

    /*
      Lexer character buffer
@ -361,33 +367,57 @@ struct _Lexer

      lexsize must be reset for each file.
    */
-    tmbstr lexbuf;          /* MB character buffer */
-    uint lexlength;         /* allocated */
-    uint lexsize;           /* used */
+    tmbstr lexbuf;             /**< MB character buffer */
+    uint lexlength;            /**< allocated */
+    uint lexsize;              /**< used */

    /* Inline stack for compatibility with Mosaic */
-    Node* inode;            /* for deferring text node */
-    IStack* insert;         /* for inferring inline tags */
+    Node* inode;               /**< for deferring text node */
+    IStack* insert;            /**< for inferring inline tags */
    IStack* istack;
-    uint istacklength;      /* allocated */
-    uint istacksize;        /* used */
-    uint istackbase;        /* start of frame */
+    uint istacklength;         /**< allocated */
+    uint istacksize;           /**< used */
+    uint istackbase;           /**< start of frame */

-    TagStyle *styles;          /* used for cleaning up presentation markup */
+    TagStyle *styles;          /**< used for cleaning up presentation markup */

-    TidyAllocator* allocator; /* allocator */
+    TidyAllocator* allocator;  /**< allocator */
 };


-/* Lexer Functions
-*/
+/**
+ *  modes for GetToken()
+ *
+ *  MixedContent   -- for elements which don't accept PCDATA
+ *  Preformatted   -- white space preserved as is
+ *  IgnoreMarkup   -- for CDATA elements such as script, style
+ */
+typedef enum
+{
+  IgnoreWhitespace,
+  MixedContent,
+  Preformatted,
+  IgnoreMarkup,
+  OtherNamespace,
+  CdataContent
+} GetTokenMode;

-/* choose what version to use for new doctype */
+
+/** @name Lexer Functions
+ *  @{
+ */
+
+
+/**
+ *  Choose what version to use for new doctype
+ */
 TY_PRIVATE int TY_(HTMLVersion)( TidyDocImpl* doc );

-/* everything is allowed in proprietary version of HTML */
-/* this is handled here rather than in the tag/attr dicts */

+/**
+ *  Everything is allowed in proprietary version of HTML.
+ *  This is handled here rather than in the tag/attr dicts
+ */
 TY_PRIVATE void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers );

 TY_PRIVATE Bool TY_(IsWhite)(uint c);
@ -399,7 +429,6 @@ TY_PRIVATE Bool TY_(IsNamechar)(uint c);
 TY_PRIVATE Bool TY_(IsXMLLetter)(uint c);
 TY_PRIVATE Bool TY_(IsXMLNamechar)(uint c);

-/* Bool IsLower(uint c); */
 TY_PRIVATE Bool TY_(IsUpper)(uint c);
 TY_PRIVATE uint TY_(ToLower)(uint c);
 TY_PRIVATE uint TY_(ToUpper)(uint c);
@ -407,60 +436,82 @@ TY_PRIVATE uint TY_(ToUpper)(uint c);
 TY_PRIVATE Lexer* TY_(NewLexer)( TidyDocImpl* doc );
 TY_PRIVATE void TY_(FreeLexer)( TidyDocImpl* doc );

-/* store character c as UTF-8 encoded byte stream */
+
+/**
+ *  Store character c as UTF-8 encoded byte stream
+ */
 TY_PRIVATE void TY_(AddCharToLexer)( Lexer *lexer, uint c );

-/*
-  Used for elements and text nodes
-  element name is NULL for text nodes
-  start and end are offsets into lexbuf
-  which contains the textual content of
-  all elements in the parse tree.

-  parent and content allow traversal
-  of the parse tree in any direction.
-  attributes are represented as a linked
-  list of AttVal nodes which hold the
-  strings for attribute/value pairs.
+/**
+ *  Used for elements and text nodes.
+ *   - Element name is NULL for text nodes.
+ *   - start and end are offsets into lexbuf,
+ *     which contains the textual content of
+ *     all elements in the parse tree.
+ *   - parent and content allow traversal
+ *     of the parse tree in any direction.
+ *   - attributes are represented as a linked
+ *     list of AttVal nodes which hold the
+ *     strings for attribute/value pairs.
 */
 TY_PRIVATE Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer );


-/* used to clone heading nodes when split by an <HR> */
+/**
+ *  Used to clone heading nodes when split by an `<HR>`
+ */
 TY_PRIVATE Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element );

-/* free node's attributes */
+
+/**
+ *  Free node's attributes
+ */
 TY_PRIVATE void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node );

-/* doesn't repair attribute list linkage */
+
+/**
+ *  Doesn't repair attribute list linkage
+ */
 TY_PRIVATE void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av );

-/* detach attribute from node */
+
+/**
+ * Detach attribute from node
+ */
 TY_PRIVATE void TY_(DetachAttribute)( Node *node, AttVal *attr );

-/* detach attribute from node then free it
-*/
+
+/**
+ *  Detach attribute from node then free it.
+ */
 TY_PRIVATE void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr );

-/*
-  Free document nodes by iterating through peers and recursing
-  through children. Set next to NULL before calling FreeNode()
-  to avoid freeing peer nodes. Doesn't patch up prev/next links.
+
+/**
+ *  Free document nodes by iterating through peers and recursing
+ *  through children. Set `next` to `NULL` before calling `FreeNode()`
+ *  to avoid freeing peer nodes. Doesn't patch up prev/next links.
 */
 TY_PRIVATE void TY_(FreeNode)( TidyDocImpl* doc, Node *node );

+
 TY_PRIVATE Node* TY_(TextToken)( Lexer *lexer );

-/* used for creating preformatted text from Word2000 */
+
+/**
+ *  Used for creating preformatted text from Word2000.
+ */
 TY_PRIVATE Node* TY_(NewLineNode)( Lexer *lexer );

-/* used for adding a &nbsp; for Word2000 */
+
+/**
+ *  Used for adding a &nbsp; for Word2000.
+ */
 TY_PRIVATE Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt );

-TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
-/* TY_PRIVATE void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); */

-/* find element */
+TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
 TY_PRIVATE Node* TY_(FindDocType)( TidyDocImpl* doc );
 TY_PRIVATE Node* TY_(FindHTML)( TidyDocImpl* doc );
 TY_PRIVATE Node* TY_(FindHEAD)( TidyDocImpl* doc );
@ -468,10 +519,16 @@ TY_PRIVATE Node* TY_(FindTITLE)(TidyDocImpl* doc);
 TY_PRIVATE Node* TY_(FindBody)( TidyDocImpl* doc );
 TY_PRIVATE Node* TY_(FindXmlDecl)(TidyDocImpl* doc);

-/* Returns containing block element, if any */
+
+/**
+ *  Returns containing block element, if any
+ */
 TY_PRIVATE Node* TY_(FindContainer)( Node* node );

-/* add meta element for Tidy */
+
+/**
+ *  Add meta element for Tidy.
+ */
 TY_PRIVATE Bool TY_(AddGenerator)( TidyDocImpl* doc );

 TY_PRIVATE uint TY_(ApparentVersion)( TidyDocImpl* doc );
@ -485,118 +542,209 @@ TY_PRIVATE Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc );
 TY_PRIVATE Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc );


-/* fixup doctype if missing */
+/**
+ *  Fixup doctype if missing.
+ */
 TY_PRIVATE Bool TY_(FixDocType)( TidyDocImpl* doc );

-/* ensure XML document starts with <?xml version="1.0"?> */
-/* add encoding attribute if not using ASCII or UTF-8 output */
+
+/**
+ *  Ensure XML document starts with <?xml version="1.0"?>, and
+ *  add encoding attribute if not using ASCII or UTF-8 output.
+ */
 TY_PRIVATE Bool TY_(FixXmlDecl)( TidyDocImpl* doc );

+
 TY_PRIVATE Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id);

 TY_PRIVATE void TY_(UngetToken)( TidyDocImpl* doc );

-
-/*
-  modes for GetToken()
-
-  MixedContent   -- for elements which don't accept PCDATA
-  Preformatted   -- white space preserved as is
-  IgnoreMarkup   -- for CDATA elements such as script, style
-*/
-typedef enum
-{
-  IgnoreWhitespace,
-  MixedContent,
-  Preformatted,
-  IgnoreMarkup,
-  OtherNamespace,
-  CdataContent
-} GetTokenMode;
-
 TY_PRIVATE Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode );

 TY_PRIVATE void TY_(InitMap)(void);


-/* create a new attribute */
+/**
+ *  Create a new attribute.
+ */
 TY_PRIVATE AttVal* TY_(NewAttribute)( TidyDocImpl* doc );

-/* create a new attribute with given name and value */
+
+/**
+ *  Create a new attribute with given name and value.
+ */
 TY_PRIVATE AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
                             int delim );

-/* insert attribute at the end of attribute list of a node */
+
+/**
+ *  Insert attribute at the end of attribute list of a node.
+ */
 TY_PRIVATE void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av );

-/* insert attribute at the start of attribute list of a node */
+/**
+ *  Insert attribute at the start of attribute list of a node.
+ */
 TY_PRIVATE void TY_(InsertAttributeAtStart)( Node *node, AttVal *av );

-/*************************************
-  In-line Stack functions
-*************************************/
+
+/** @}
+ *  @name Inline Stack Functions
+ *  @{
+ */


-/* duplicate attributes */
+/**
+ *  Duplicate attributes.
+ */
 TY_PRIVATE AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs );

-/*
-  push a copy of an inline node onto stack
-  but don't push if implicit or OBJECT or APPLET
-  (implicit tags are ones generated from the istack)

-  One issue arises with pushing inlines when
-  the tag is already pushed. For instance:
-
-      <p><em>text
-      <p><em>more text
-
-  Shouldn't be mapped to
-
-      <p><em>text</em></p>
-      <p><em><em>more text</em></em>
-*/
+/**
+ *  Push a copy of an inline node onto stack, but don't push if
+ *  implicit or OBJECT or APPLET (implicit tags are ones generated
+ *  from the istack).
+ *
+ *  One issue arises with pushing inlines when the tag is already pushed.
+ *  For instance:
+ *    ~~~
+ *    <p><em>text
+ *    <p><em>more text
+ *    ~~~
+ *  Shouldn't be mapped to
+ *    ~~~
+ *    <p><em>text</em></p>
+ *    <p><em><em>more text</em></em>
+ *    ~~~
+ */
 TY_PRIVATE void TY_(PushInline)( TidyDocImpl* doc, Node* node );

-/* pop inline stack */
+
+/**
+ * Pop inline stack.
+ */
 TY_PRIVATE void TY_(PopInline)( TidyDocImpl* doc, Node* node );

+
 TY_PRIVATE Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node );
 TY_PRIVATE Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node );

-/*
-  This has the effect of inserting "missing" inline
-  elements around the contents of blocklevel elements
-  such as P, TD, TH, DIV, PRE etc. This procedure is
-  called at the start of ParseBlock. when the inline
-  stack is not empty, as will be the case in:

-    <i><h1>italic heading</h1></i>
-
-  which is then treated as equivalent to
-
-    <h1><i>italic heading</i></h1>
-
-  This is implemented by setting the lexer into a mode
-  where it gets tokens from the inline stack rather than
-  from the input stream.
-*/
+/**
+ *  This has the effect of inserting "missing" inline elements around the
+ *  contents of blocklevel elements such as P, TD, TH, DIV, PRE etc. This
+ *  procedure is called at the start of `ParseBlock`, when the inline
+ *  stack is not empty, as will be the case in:
+ *    ~~~
+ *    <i><h1>italic heading</h1></i>
+ *    ~~~
+ *  which is then treated as equivalent to
+ *    ~~~
+ *    <h1><i>italic heading</i></h1>
+ *    ~~~
+ *  This is implemented by setting the lexer into a mode where it gets
+ *  tokens from the inline stack rather than from the input stream.
+ */
 TY_PRIVATE int TY_(InlineDup)( TidyDocImpl* doc, Node *node );

-/*
- defer duplicates when entering a table or other
- element where the inlines shouldn't be duplicated
-*/
+
+/**
+ *  Defer duplicates when entering a table or other
+ *  element where the inlines shouldn't be duplicated.
+ */
 TY_PRIVATE void TY_(DeferDup)( TidyDocImpl* doc );
+
+
 TY_PRIVATE Node* TY_(InsertedToken)( TidyDocImpl* doc );

-/* stack manipulation for inline elements */
+/**
+ *  Stack manipulation for inline elements
+ */
 TY_PRIVATE Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node );
+
+
 TY_PRIVATE Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element );

+
+/** @}
+ *  @name Generic stack of nodes.
+ *  @{
+ */
+
+
+/**
+ * This typedef represents a stack of addresses to nodes. Tidy uses these to
+ * try to limit recursion by pushing nodes to a stack when possible instead
+ * of recursing.
+ */
+typedef struct _Stack {
+    int top;                        /**< Current top position; -1 when the stack is empty. */
+    unsigned capacity;              /**< Current capacity. Can be expanded. */
+    Node **firstNode;               /**< A pointer to the first pointer to a Node in an array of node addresses. */
+    TidyAllocator* allocator;       /**< Tidy's allocator, used at instantiation and expanding. */
+} Stack;
+ 
+
+/**
+ * Create a new stack with a given starting capacity. If memory allocation
+ * fails, then the allocator will panic the program automatically.
+ */
+TY_PRIVATE Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity);
+ 
+
+/**
+ *  Increase the stack size. This will be called automatically when the
+ *  current stack is full. If memory allocation fails, then the allocator
+ *  will panic the program automatically.
+ */
+TY_PRIVATE void TY_(growStack)(Stack *stack);
+
+
+/**
+ * Stack is full when top is equal to the last index.
+ */
+TY_PRIVATE Bool TY_(stackFull)(Stack *stack);
+
+
+/**
+ * Stack is empty when top is equal to -1
+ */
+TY_PRIVATE Bool TY_(stackEmpty)(Stack *stack);
+ 
+
+/**
+ * Push an item to the stack.
+ */
+TY_PRIVATE void TY_(push)(Stack *stack, Node *node);
+
+
+/**
+ * Pop an item from the stack.
+ */
+TY_PRIVATE Node* TY_(pop)(Stack *stack);
+
+
+/**
+ * Peek at the stack.
+ */
+TY_PRIVATE Node* TY_(peek)(Stack *stack);
+
+/**
+ *  Frees the stack when done.
+ */
+TY_PRIVATE void TY_(freeStack)(Stack *stack);
+
+
+/** @}
+ */
+
+
 #ifdef __cplusplus
 }
 #endif


+/** @} end lexer_h group */
+/** @} end internal_api group */
+
 #endif /* __LEXER_H__ */
--- a/src/parser.c
+++ b/src/parser.c
--- a/src/parser.h
+++ b/src/parser.h
@ -41,6 +41,74 @@
 ******************************************************************************/


+/**
+ *  The parsers keep track of their states with the states defined here, and
+ *  use these symbols when pushing to the stack so that they can later recreate
+ *  their environments when re-entered.
+ */
+typedef enum {
+    /* Universal states. */
+    STATE_INITIAL,             /**< This is the initial state for every parser. */
+    STATE_COMPLETE,            /**< Complete! */
+    STATE_PARSE_TAG,
+    STATE_PARSE_TAG_DONE,
+    /* ParseHTML states. */
+    STATE_PRE_HEAD,            /**< In this state, we've not detected head yet. */
+    STATE_PRE_BODY,            /**< In this state, we'll consider frames vs. body. */
+    STATE_PARSE_BODY,          /**< In this state, we can parse the body. */
+    STATE_PARSE_HEAD,          /**< In this state, we will setup head for parsing. */
+    STATE_PARSE_HEAD_DONE,     /**< Resume here after parsing head. */
+    STATE_PARSE_NOFRAMES,      /**< In this state, we can parse noframes content. */
+    STATE_PARSE_NOFRAMES_DONE, /**< In this state, we can restore more state. */
+    STATE_PARSE_FRAMESET,      /**< In this state, we will parse frameset content. */
+    STATE_PARSE_FRAMESET_DONE, /**< We need to cleanup some things after parsing frameset. */
+} parserState;
+
+
+/**
+ *  This typedef represents the state of a parser when it enters and exits.
+ *  When the parser needs to finish work on the way back up the stack, it will
+ *  push one of these records to the stack, and it will pop a record from the
+ *  stack upon re-entry.
+ */
+typedef struct _TidyParserMemory
+{
+    Parser       *identity;      /**< Which parser pushed this record? */
+    Node         *original_node; /**< Originally provided node at entry. */
+    Node         *reentry_node;  /**< A node a parser might want to save. */
+    GetTokenMode reentry_mode;   /**< The mode to use for the next node. */
+    parserState  reentry_state;  /**< State to set during re-entry. */
+    GetTokenMode mode;           /**< The caller will peek at this value to get the correct mode. */
+} TidyParserMemory;
+
+
+/**
+ *  This typedef represents a stack of TidyParserMemory records. The Tidy
+ *  document has its own instance of this.
+ */
+typedef struct _TidyParserStack
+{
+    TidyParserMemory* content;    /**< A state record. */
+    TidyAllocator* allocator;     /**< The allocator used for creating. */
+    uint size;                    /**< Current size of the stack. */
+    int top;                      /**< Top of the stack. */
+} TidyParserStack;
+
+
+/**
+ *  Allocates and initializes the parser's stack. TidyCreate will perform
+ *  this automatically.
+ */
+void TY_(InitParserStack)( TidyDocImpl* doc );
+
+
+/**
+ *  Frees the parser's stack when done. TidyRelease will perform this
+ *  automatically.
+ */
+void TY_(FreeParserStack)( TidyDocImpl* doc );
+
+
 /**
 *  Is used to perform a node integrity check recursively after parsing
 *  an HTML or XML document.
@ -96,7 +164,7 @@ TY_PRIVATE Node *TY_(RemoveNode)(Node *node);

 /**
 *  Remove node from markup tree and discard it.
- *  @param doc The Tidy document from which to discarb the node.
+ *  @param doc The Tidy document from which to discard the node.
 *  @param element The node to discard.
 *  @returns Returns the next node.
 */
@ -202,4 +270,3 @@ TY_PRIVATE void TY_(ParseXMLDocument)( TidyDocImpl* doc );
 /** @} end internal_api group */

 #endif /* __PARSER_H__ */
-
--- a/src/tags.h
+++ b/src/tags.h
@ -61,8 +61,13 @@ typedef enum


 /** This typedef describes a function to be used to parse HTML of a Tidy tag.
+ ** @param doc The Tidy document.
+ ** @param node The node being parsed.
+ ** @param mode The GetTokenMode to be used for parsing the node contents.
+ ** @param popStack A flag indicating that we are re-entering this parser, and
+ **   it should restore a state from the stack.
 */
-typedef void (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode );
+typedef Node* (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode, Bool popStack );


 /** This typedef describes a function to be used to check the attributes
--- a/src/tidy-int.h
+++ b/src/tidy-int.h
@ -16,6 +16,7 @@
 #include "pprint.h"
 #include "access.h"
 #include "message.h"
+#include "parser.h"

 #ifndef MAX
 #define MAX(a,b) (((a) > (b))?(a):(b))
@ -54,6 +55,7 @@ struct _TidyDocImpl
    StreamIn*                docIn;
    StreamOut*               docOut;
    StreamOut*               errout;
+
    TidyReportFilter         reportFilter;
    TidyReportCallback       reportCallback;
    TidyMessageCallback      messageCallback;
@ -62,6 +64,8 @@ struct _TidyDocImpl
    TidyConfigChangeCallback pConfigChangeCallback;
    TidyPPProgress           progressCallback;

+    TidyParserStack          stack;
+
    /* Parse + Repair Results */
    uint                optionErrors;
    uint                errors;
--- a/src/tidylib.c
+++ b/src/tidylib.c
@ -112,6 +112,7 @@ TidyDocImpl* tidyDocCreate( TidyAllocator *allocator )
    TY_(InitAttrs)( doc );
    TY_(InitConfig)( doc );
    TY_(InitPrintBuf)( doc );
+    TY_(InitParserStack)( doc );

    /* Set the locale for tidy's output. This both configures
    ** LibTidy to use the environment's locale as well as the
@ -172,6 +173,7 @@ void          tidyDocRelease( TidyDocImpl* doc )
         *  to determine which hash is to be used, so free it last.
        \*/
        TY_(FreeLexer)( doc );
+        TY_(FreeParserStack)( doc );
        TidyDocFree( doc, doc );
    }
 }