Merge pull request #981 from htacg/iterate

Documentation and Recursion
2021-07-29 06:22:48 -04:00 · 2021-07-29 06:22:48 -04:00 · db847e6e1c
parent ab6b76dae9 e56716f154
commit db847e6e1c
9 changed files with 3698 additions and 1509 deletions
--- a/src/clean.c
+++ b/src/clean.c
@ -1585,11 +1585,16 @@ void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
 */
 void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
 {
    Stack *stack = TY_(newStack)(doc, 16);
    Node *next;
    tmbchar indent_buf[ 32 ];
    uint indent;
    while (node)
    {
        next = node->next;
        if ( nodeIsBLOCKQUOTE(node) && node->implicit )
        {
            indent = 1;
@ -1602,19 +1607,27 @@ void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
                StripOnlyChild( doc, node );
            }
            if (node->content)
                TY_(BQ2Div)( doc, node->content );
            TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
                             2*indent);
            RenameElem( doc, node, TidyTag_DIV );
            TY_(AddStyleProperty)(doc, node, indent_buf );
            if (node->content)
            {
                TY_(push)(stack, next);
                node = node->content;
                continue;
            }
        }
        else if (node->content)
-            TY_(BQ2Div)( doc, node->content );
+        {
            TY_(push)(stack, next);
            node = node->content;
            continue;
        }
-        node = node->next;
+        node = next ? next : TY_(pop)(stack);
    }
 }
@ -2736,30 +2749,42 @@ void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
 */
 static void StyleToHead(TidyDocImpl* doc, Node *head, Node *node, Bool fix, int indent)
 {
-	Node *next;
+    Stack *stack = TY_(newStack)(doc, 16);
-	while (node)
+    Node *next;
-	{
+    
-		next = node->next;	/* get 'next' now , in case the node is moved */
+    while (node)
-		/* dbg_show_node(doc, node, 0, indent); */
+    {
-		if (nodeIsSTYLE(node))
+        next = node->next;
-		{
+        
-			if (fix)
+        if (nodeIsSTYLE(node))
-			{
+        {
-				TY_(RemoveNode)(node); /* unhook style node from body */
+            if (fix)
-				TY_(InsertNodeAtEnd)(head, node);   /* add to end of head */
+            {
-				TY_(Report)(doc, node, head, MOVED_STYLE_TO_HEAD); /* report move */
+                TY_(RemoveNode)(node); /* unhook style node from body */
-			}
+                TY_(InsertNodeAtEnd)(head, node);   /* add to end of head */
-			else
+                TY_(Report)(doc, node, head, MOVED_STYLE_TO_HEAD); /* report move */
-			{
+            }
-				TY_(Report)(doc, node, head, FOUND_STYLE_IN_BODY);
+            else
-			}
+            {
-		}
+                TY_(Report)(doc, node, head, FOUND_STYLE_IN_BODY);
-		else if (node->content)
+            }
-		{
+        }
-			StyleToHead(doc, head, node->content, fix, indent + 1);
+        else if (node->content)
-		}
+        {
-		node = next;	/* process the 'next', if any */
+            TY_(push)(stack, next);
-	}
+            node = node->content;
            indent++;
            continue;
        }
        if (next)
            node = next;
        else
        {
            node = TY_(pop)(stack);
            indent--;
        }
    }
 }
--- a/src/lexer.c
+++ b/src/lexer.c
@ -877,15 +877,6 @@ static tmbchar LastChar( tmbstr str )
    return 0;
 }
 /*
   node->type is one of these:
    #define TextNode    1
    #define StartTag    2
    #define EndTag      3
    #define StartEndTag 4
 */
 Lexer* TY_(NewLexer)( TidyDocImpl* doc )
 {
    Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) );
@ -1545,13 +1536,7 @@ void TY_(FreeNode)( TidyDocImpl* doc, Node *node )
        }
    }
 #endif
-    /* this is no good ;=((
+
    if (node && doc && doc->lexer) {
        if (node == doc->lexer->token) {
            doc->lexer->token = NULL; // TY_(NewNode)( doc->lexer->allocator, doc->lexer );
        }
    }
      ----------------- */
    while ( node )
    {
        Node* next = node->next;
@ -4462,11 +4447,102 @@ static Node *ParseDocTypeDecl(TidyDocImpl* doc)
    return NULL;
 }
-/*
+
- * local variables:
+/****************************************************************************//*
- * mode: c
+ ** MARK: - Node Stack
- * indent-tabs-mode: nil
+ ***************************************************************************/
- * c-basic-offset: 4
+
- * eval: (c-set-offset 'substatement-open 0)
+
- * end:
+/**
 * Create a new stack with a given starting capacity. If memory allocation
 * fails, then the allocator will panic the program automatically.
 */
 Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity)
 {
    Stack *stack = (Stack *)TidyAlloc(doc->allocator, sizeof(Stack));

    /* Never start with zero slots: a zero-sized allocation is not a usable
       buffer, and doubling zero cannot enlarge it. */
    stack->capacity = capacity ? capacity : 1;

    /* -1 marks the empty stack; the first push moves top to index 0. */
    stack->top = -1;

    /* The buffer is an array of Node pointers, so size each slot as one
       element of that array. */
    stack->firstNode = (Node **)TidyAlloc(doc->allocator,
                                          stack->capacity * sizeof(*stack->firstNode));

    /* Remember the allocator so the buffer can later be grown and freed
       without needing the document. */
    stack->allocator = doc->allocator;
    return stack;
 }
 /**
 *  Increase the stack size by doubling its capacity (or to a single slot
 *  if the stack currently has none). This will be called automatically
 *  when the current stack is full. The existing entries are copied into
 *  the new buffer and the old buffer is released. If memory allocation
 *  fails, then the allocator will panic the program automatically.
 */
 void TY_(growStack)(Stack *stack)
 {
    /* Doubling zero would stay zero, so fall back to one slot. */
    uint new_capacity = stack->capacity ? stack->capacity * 2 : 1;

    /* The allocator takes a size in bytes, so the slot count must be
       scaled by the size of one slot (a Node pointer). */
    Node **firstNode = (Node **)TidyAlloc(stack->allocator,
                                          new_capacity * sizeof(*firstNode));

    /* top is the index of the last used slot, so top + 1 entries are live
       (zero when the stack is empty and top is -1). */
    memcpy( firstNode, stack->firstNode, sizeof(*firstNode) * (stack->top + 1) );
    TidyFree(stack->allocator, stack->firstNode);

    stack->firstNode = firstNode;
    stack->capacity = new_capacity;
 }
 /**
 * Report whether every slot of the stack is in use, i.e. whether top
 * already refers to the last valid index (capacity - 1).
 */
 Bool TY_(stackFull)(Stack *stack)
 {
    Bool isFull = (stack->top == stack->capacity - 1);
    return isFull;
 }
 /**
 * Report whether the stack holds no nodes. An empty stack is marked by
 * top being -1.
 */
 Bool TY_(stackEmpty)(Stack *stack)
 {
    Bool isEmpty = (stack->top == -1);
    return isEmpty;
 }
 /**
 * Push an item to the stack. A NULL node is ignored: nothing is stored
 * and the stack is left untouched. The stack is grown first when it is
 * already full.
 */
 void TY_(push)(Stack *stack, Node *node)
 {
    /* Nothing is stored for NULL, so don't enlarge the buffer for it. */
    if (!node)
        return;

    if (TY_(stackFull)(stack))
        TY_(growStack)(stack);

    /* Advance top to the next free slot, then store the node there. */
    stack->firstNode[++stack->top] = node;
 }
 /**
 * Remove the most recently pushed node from the stack and return it.
 * Returns NULL when the stack is empty.
 */
 Node* TY_(pop)(Stack *stack)
 {
    Node *topNode = NULL;

    if (!TY_(stackEmpty)(stack))
    {
        /* Read the top slot, then step top down past it. */
        topNode = stack->firstNode[stack->top];
        stack->top--;
    }
    return topNode;
 }
 /**
 * Peek at the stack: return the most recently pushed node without
 * removing it. Returns NULL when the stack is empty.
 */
 FUNC_UNUSED Node* TY_(peek)(Stack *stack)
 {
    /* Read the top slot only; top must not move, otherwise peeking would
       discard the node just like a pop. */
    return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top];
 }
 /**
 *  Releases the stack's node buffer when done and resets every field to
 *  an empty, unusable state. Only the buffer is freed here; the Stack
 *  structure itself is not released by this function.
 */
 void TY_(freeStack)(Stack *stack)
 {
    /* The buffer must be freed while the allocator is still recorded. */
    TidyFree( stack->allocator, stack->firstNode );

    stack->firstNode = NULL;
    stack->allocator = NULL;
    stack->capacity = 0;
    stack->top = -1;
 }
--- a/src/lexer.h
+++ b/src/lexer.h
@ -1,33 +1,46 @@
 #ifndef __LEXER_H__
 #define __LEXER_H__
 /* lexer.h -- Lexer for html parser
   (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
   See tidy.h for the copyright notice.
-  Given an input source, it returns a sequence of tokens.
+/**************************************************************************//**
-
+ * @file
-     GetToken(source) gets the next token
+ * Lexer for HTML and XML Parsers.
-     UngetToken(source) provides one level undo
+ *
-
+ *   Given an input source, it returns a sequence of tokens.
-  The tags include an attribute list:
+ *
-
+ *      GetToken(source) gets the next token
-    - linked list of attribute/value nodes
+ *      UngetToken(source) provides one level undo
-    - each node has 2 NULL-terminated strings.
+ *
-    - entities are replaced in attribute values
+ *   The tags include an attribute list:
-
+ *
-  white space is compacted if not in preformatted mode
+ *     - linked list of attribute/value nodes
-  If not in preformatted mode then leading white space
+ *     - each node has 2 NULL-terminated strings.
-  is discarded and subsequent white space sequences
+ *     - entities are replaced in attribute values
-  compacted to single space characters.
+ *
-
+ *   white space is compacted if not in preformatted mode
-  If XmlTags is no then Tag names are folded to upper
+ *   If not in preformatted mode then leading white space
-  case and attribute names to lower case.
+ *   is discarded and subsequent white space sequences
-
+ *   compacted to single space characters.
- Not yet done:
+ *
-    -   Doctype subset and marked sections
+ *   If XmlTags is no then Tag names are folded to upper
-*/
+ *   case and attribute names to lower case.
 *
 *  Not yet done:
 *     - Doctype subset and marked sections
 *
 * @author  HTACG, et al (consult git log)
 *
 * @copyright
 *     (c) 1998-2021 (W3C) MIT, ERCIM, Keio University, and HTACG.
 *     See tidy.h for the copyright notice.
 * @par
 *     All Rights Reserved.
 * @par
 *     See `tidy.h` for the complete license.
 *
 * @date Additional updates: consult git log
 *
 ******************************************************************************/
 #ifdef __cplusplus
 extern "C" {
@ -35,8 +48,23 @@ extern "C" {
 #include "forward.h"
-/* lexer character types
+/** @addtogroup internal_api */
-*/
+/** @{ */
 /***************************************************************************//**
 ** @defgroup lexer_h HTML and XML Lexing
 **
 ** These functions and structures form the internal API for document
 ** lexing.
 **
 ** @{
 ******************************************************************************/
 /**
 *  Lexer character types.
 */
 #define digit       1u
 #define letter      2u
 #define namechar    4u
@ -47,8 +75,9 @@ extern "C" {
 #define digithex    128u
-/* node->type is one of these values
+/**
-*/
+ *  node->type is one of these values
 */
 typedef enum
 {
  RootNode,
@ -68,9 +97,9 @@ typedef enum
 } NodeType;
-
+/**
-/* lexer GetToken states
+ *  Lexer GetToken() states.
-*/
+ */
 typedef enum
 {
  LEX_CONTENT,
@ -88,7 +117,10 @@ typedef enum
  LEX_XMLDECL
 } LexerState;
-/* ParseDocTypeDecl state constants */
+
 /**
 *  ParseDocTypeDecl state constants.
 */
 typedef enum
 {
  DT_INTERMEDIATE,
@ -98,67 +130,44 @@ typedef enum
  DT_INTSUBSET
 } ParseDocTypeDeclState;
 /* content model shortcut encoding
-   Descriptions are tentative.
+/**
-*/
+ *  Content model shortcut encoding.
 *  Descriptions are tentative.
 */
 #define CM_UNKNOWN      0
-/* Elements with no content. Map to HTML specification. */
+#define CM_EMPTY        (1 << 0)   /**< Elements with no content. Map to HTML specification. */
-#define CM_EMPTY        (1 << 0)
+#define CM_HTML         (1 << 1)   /**< Elements that appear outside of "BODY". */
-/* Elements that appear outside of "BODY". */
+#define CM_HEAD         (1 << 2)   /**< Elements that can appear within HEAD. */
-#define CM_HTML         (1 << 1)
+#define CM_BLOCK        (1 << 3)   /**< HTML "block" elements. */
-/* Elements that can appear within HEAD. */
+#define CM_INLINE       (1 << 4)   /**< HTML "inline" elements. */
-#define CM_HEAD         (1 << 2)
+#define CM_LIST         (1 << 5)   /**< Elements that mark list item ("LI"). */
-/* HTML "block" elements. */
+#define CM_DEFLIST      (1 << 6)   /**< Elements that mark definition list item ("DL", "DT"). */
-#define CM_BLOCK        (1 << 3)
+#define CM_TABLE        (1 << 7)   /**< Elements that can appear inside TABLE. */
-/* HTML "inline" elements. */
+#define CM_ROWGRP       (1 << 8)   /**< Used for "THEAD", "TFOOT" or "TBODY". */
-#define CM_INLINE       (1 << 4)
+#define CM_ROW          (1 << 9)   /**< Used for "TD", "TH" */
-/* Elements that mark list item ("LI"). */
+#define CM_FIELD        (1 << 10)  /**< Elements whose content must be protected against white space movement. Includes some elements that can found in forms. */
-#define CM_LIST         (1 << 5)
+#define CM_OBJECT       (1 << 11)  /**< Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET. */
-/* Elements that mark definition list item ("DL", "DT"). */
+#define CM_PARAM        (1 << 12)  /**< Elements that allows "PARAM". */
-#define CM_DEFLIST      (1 << 6)
+#define CM_FRAMES       (1 << 13)  /**< "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
-/* Elements that can appear inside TABLE. */
+#define CM_HEADING      (1 << 14)  /**< Heading elements (h1, h2, ...). */
-#define CM_TABLE        (1 << 7)
+#define CM_OPT          (1 << 15)  /**< Elements with an optional end tag. */
-/* Used for "THEAD", "TFOOT" or "TBODY". */
+#define CM_IMG          (1 << 16)  /**< Elements that use "align" attribute for vertical position. */
-#define CM_ROWGRP       (1 << 8)
+#define CM_MIXED        (1 << 17)  /**< Elements with inline and block model. Used to avoid calling InlineDup. */
-/* Used for "TD", "TH" */
+#define CM_NO_INDENT    (1 << 18)  /**< Elements whose content needs to be indented only if containing one CM_BLOCK element. */
-#define CM_ROW          (1 << 9)
+#define CM_OBSOLETE     (1 << 19)  /**< Elements that are obsolete (such as "dir", "menu"). */
-/* Elements whose content must be protected against white space movement.
+#define CM_NEW          (1 << 20)  /**< User defined elements. Used to determine how attributes without value should be printed. */
-   Includes some elements that can found in forms. */
+#define CM_OMITST       (1 << 21)   /**< Elements that cannot be omitted. */
 #define CM_FIELD        (1 << 10)
 /* Used to avoid propagating inline emphasis inside some elements
   such as OBJECT or APPLET. */
 #define CM_OBJECT       (1 << 11)
 /* Elements that allows "PARAM". */
 #define CM_PARAM        (1 << 12)
 /* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
 #define CM_FRAMES       (1 << 13)
 /* Heading elements (h1, h2, ...). */
 #define CM_HEADING      (1 << 14)
 /* Elements with an optional end tag. */
 #define CM_OPT          (1 << 15)
 /* Elements that use "align" attribute for vertical position. */
 #define CM_IMG          (1 << 16)
 /* Elements with inline and block model. Used to avoid calling InlineDup. */
 #define CM_MIXED        (1 << 17)
 /* Elements whose content needs to be indented only if containing one 
   CM_BLOCK element. */
 #define CM_NO_INDENT    (1 << 18)
 /* Elements that are obsolete (such as "dir", "menu"). */
 #define CM_OBSOLETE     (1 << 19)
 /* User defined elements. Used to determine how attributes without value
   should be printed. */
 #define CM_NEW          (1 << 20)
 /* Elements that cannot be omitted. */
 #define CM_OMITST       (1 << 21)
-/* If the document uses just HTML 2.0 tags and attributes described
+
-** it as HTML 2.0 Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
+/**
-** If there are proprietary tags and attributes then describe it as
+ *  If the document uses just HTML 2.0 tags and attributes described
-** HTML Proprietary. If it includes the xml-lang or xmlns attributes
+ *  it is HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
-** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
+ *  If there are proprietary tags and attributes then describe it as
-** flavors of Voyager (strict, loose or frameset).
+ *  HTML Proprietary. If it includes the xml-lang or xmlns attributes
-*/
+ *  but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
 *  flavors of Voyager (strict, loose or frameset).
 */
 /* unknown */
 #define xxxx                   0u
@ -220,8 +229,10 @@ typedef enum
 /* all proprietary types */
 #define VERS_PROPRIETARY   (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
-/* Linked list of class names and styles
+
-*/
+/**
 *  Linked list of class names and styles
 */
 struct _Style;
 typedef struct _Style TagStyle;
@ -234,8 +245,9 @@ struct _Style
 };
-/* Linked list of style properties
+/**
-*/
+ *  Linked list of style properties
 */
 struct _StyleProp;
 typedef struct _StyleProp StyleProp;
@ -247,11 +259,9 @@ struct _StyleProp
 };
-
+/**
-
+ *  Attribute/Value linked list node
-/* Attribute/Value linked list node
+ */
 */
 struct _AttVal
 {
    AttVal*           next;
@ -264,93 +274,89 @@ struct _AttVal
 };
-
+/**
-/*
+ *  Mosaic handles inlines via a separate stack from other elements
-  Mosaic handles inlines via a separate stack from other elements
+ *  We duplicate this to recover from inline markup errors such as:
-  We duplicate this to recover from inline markup errors such as:
+ *     ~~~
-
+ *     <i>italic text
-     <i>italic text
+ *     <p>more italic text</b> normal text
-     <p>more italic text</b> normal text
+ *     ~~~
-
+ *  which for compatibility with Mosaic is mapped to:
-  which for compatibility with Mosaic is mapped to:
+ *     ~~~
-
+ *     <i>italic text</i>
-     <i>italic text</i>
+ *     <p><i>more italic text</i> normal text
-     <p><i>more italic text</i> normal text
+ *     ~~~
-
+ *  Note that any inline end tag pops the effect of the current
-  Note that any inline end tag pop's the effect of the current
+ *  inline start tag, so that `</b>` pops `<i>` in the above example.
  inline start tag, so that </b> pop's <i> in the above example.
 */
 struct _IStack
 {
    IStack*     next;
-    const Dict* tag;        /* tag's dictionary definition */
+    const Dict* tag;        /**< tag's dictionary definition */
-    tmbstr      element;    /* name (NULL for text nodes) */
+    tmbstr      element;    /**< name (NULL for text nodes) */
    AttVal*     attributes;
 };
-/* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl,
+/**
-** etc. etc.
+ *  HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc.
-*/
+ */
 struct _Node
 {
-    Node*       parent;         /* tree structure */
+    Node*       parent;         /**< tree structure */
    Node*       prev;
    Node*       next;
    Node*       content;
    Node*       last;
    AttVal*     attributes;
-    const Dict* was;            /* old tag when it was changed */
+    const Dict* was;            /**< old tag when it was changed */
-    const Dict* tag;            /* tag's dictionary definition */
+    const Dict* tag;            /**< tag's dictionary definition */
-    tmbstr      element;        /* name (NULL for text nodes) */
+    tmbstr      element;        /**< name (NULL for text nodes) */
-    uint        start;          /* start of span onto text array */
+    uint        start;          /**< start of span onto text array */
-    uint        end;            /* end of span onto text array */
+    uint        end;            /**< end of span onto text array */
-    NodeType    type;           /* TextNode, StartTag, EndTag etc. */
+    NodeType    type;           /**< TextNode, StartTag, EndTag etc. */
-    uint        line;           /* current line of document */
+    uint        line;           /**< current line of document */
-    uint        column;         /* current column of document */
+    uint        column;         /**< current column of document */
-    Bool        closed;         /* true if closed by explicit end tag */
+    Bool        closed;         /**< true if closed by explicit end tag */
-    Bool        implicit;       /* true if inferred */
+    Bool        implicit;       /**< true if inferred */
-    Bool        linebreak;      /* true if followed by a line break */
+    Bool        linebreak;      /**< true if followed by a line break */
 };
-/*
+/**
-  The following are private to the lexer
+ *  The following are private to the lexer.
-  Use NewLexer() to create a lexer, and
+ *  Use `NewLexer()` to create a lexer, and `FreeLexer()` to free it.
-  FreeLexer() to free it.
+ */
 */
 struct _Lexer
 {
-    uint lines;             /* lines seen */
+    uint lines;                /**< lines seen */
-    uint columns;           /* at start of current token */
+    uint columns;              /**< at start of current token */
-    Bool waswhite;          /* used to collapse contiguous white space */
+    Bool waswhite;             /**< used to collapse contiguous white space */
-    Bool pushed;            /* true after token has been pushed back */
+    Bool pushed;               /**< true after token has been pushed back */
-    Bool insertspace;       /* when space is moved after end tag */
+    Bool insertspace;          /**< when space is moved after end tag */
-    Bool excludeBlocks;     /* Netscape compatibility */
+    Bool excludeBlocks;        /**< Netscape compatibility */
-    Bool exiled;            /* true if moved out of table */
+    Bool exiled;               /**< true if moved out of table */
-    Bool isvoyager;         /* true if xmlns attribute on html element */
+    Bool isvoyager;            /**< true if xmlns attribute on html element (i.e., "Voyager" was the W3C codename for XHTML). */
-    uint versions;          /* bit vector of HTML versions */
+    uint versions;             /**< bit vector of HTML versions */
-    uint doctype;           /* version as given by doctype (if any) */
+    uint doctype;              /**< version as given by doctype (if any) */
-    uint versionEmitted;    /* version of doctype emitted */
+    uint versionEmitted;       /**< version of doctype emitted */
-    Bool bad_doctype;       /* e.g. if html or PUBLIC is missing */
+    Bool bad_doctype;          /**< e.g. if html or PUBLIC is missing */
-    uint txtstart;          /* start of current node */
+    uint txtstart;             /**< start of current node */
-    uint txtend;            /* end of current node */
+    uint txtend;               /**< end of current node */
-    LexerState state;       /* state of lexer's finite state machine */
+    LexerState state;          /**< state of lexer's finite state machine */
-    Node* token;            /* last token returned by GetToken() */
+    Node* token;               /**< last token returned by GetToken() */
-    Node* itoken;           /* last duplicate inline returned by GetToken() */
+    Node* itoken;              /**< last duplicate inline returned by GetToken() */
-    Node* root;             /* remember root node of the document */
+    Node* root;                /**< remember root node of the document */
-    Node* parent;           /* remember parent node for CDATA elements */
+    Node* parent;              /**< remember parent node for CDATA elements */
-    
+
-    Bool seenEndBody;       /* true if a </body> tag has been encountered */
+    Bool seenEndBody;          /**< true if a `</body>` tag has been encountered */
-    Bool seenEndHtml;       /* true if a </html> tag has been encountered */
+    Bool seenEndHtml;          /**< true if a `</html>` tag has been encountered */
    /*
      Lexer character buffer
@ -361,33 +367,57 @@ struct _Lexer
      lexsize must be reset for each file.
    */
-    tmbstr lexbuf;          /* MB character buffer */
+    tmbstr lexbuf;             /**< MB character buffer */
-    uint lexlength;         /* allocated */
+    uint lexlength;            /**< allocated */
-    uint lexsize;           /* used */
+    uint lexsize;              /**< used */
    /* Inline stack for compatibility with Mosaic */
-    Node* inode;            /* for deferring text node */
+    Node* inode;               /**< for deferring text node */
-    IStack* insert;         /* for inferring inline tags */
+    IStack* insert;            /**< for inferring inline tags */
    IStack* istack;
-    uint istacklength;      /* allocated */
+    uint istacklength;         /**< allocated */
-    uint istacksize;        /* used */
+    uint istacksize;           /**< used */
-    uint istackbase;        /* start of frame */
+    uint istackbase;           /**< start of frame */
-    TagStyle *styles;          /* used for cleaning up presentation markup */
+    TagStyle *styles;          /**< used for cleaning up presentation markup */
-    TidyAllocator* allocator; /* allocator */
+    TidyAllocator* allocator;  /**< allocator */
 };
-/* Lexer Functions
+/**
-*/
+ *  modes for GetToken()
 *
 *  MixedContent   -- for elements which don't accept PCDATA
 *  Preformatted   -- white space preserved as is
 *  IgnoreMarkup   -- for CDATA elements such as script, style
 */
 typedef enum
 {
  IgnoreWhitespace,
  MixedContent,
  Preformatted,
  IgnoreMarkup,
  OtherNamespace,
  CdataContent
 } GetTokenMode;
-/* choose what version to use for new doctype */
+
 /** @name Lexer Functions
 *  @{
 */
 /**
 *  Choose what version to use for new doctype
 */
 TY_PRIVATE int TY_(HTMLVersion)( TidyDocImpl* doc );
 /* everything is allowed in proprietary version of HTML */
 /* this is handled here rather than in the tag/attr dicts */
 /**
 *  Everything is allowed in proprietary version of HTML.
 *  This is handled here rather than in the tag/attr dicts
 */
 TY_PRIVATE void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers );
 TY_PRIVATE Bool TY_(IsWhite)(uint c);
@ -399,7 +429,6 @@ TY_PRIVATE Bool TY_(IsNamechar)(uint c);
 TY_PRIVATE Bool TY_(IsXMLLetter)(uint c);
 TY_PRIVATE Bool TY_(IsXMLNamechar)(uint c);
 /* Bool IsLower(uint c); */
 TY_PRIVATE Bool TY_(IsUpper)(uint c);
 TY_PRIVATE uint TY_(ToLower)(uint c);
 TY_PRIVATE uint TY_(ToUpper)(uint c);
@ -407,60 +436,82 @@ TY_PRIVATE uint TY_(ToUpper)(uint c);
 TY_PRIVATE Lexer* TY_(NewLexer)( TidyDocImpl* doc );
 TY_PRIVATE void TY_(FreeLexer)( TidyDocImpl* doc );
-/* store character c as UTF-8 encoded byte stream */
+
 /**
 *  Store character c as UTF-8 encoded byte stream
 */
 TY_PRIVATE void TY_(AddCharToLexer)( Lexer *lexer, uint c );
 /*
  Used for elements and text nodes
  element name is NULL for text nodes
  start and end are offsets into lexbuf
  which contains the textual content of
  all elements in the parse tree.
-  parent and content allow traversal
+/**
-  of the parse tree in any direction.
+ *  Used for elements and text nodes.
-  attributes are represented as a linked
+ *   - Element name is NULL for text nodes.
-  list of AttVal nodes which hold the
+ *   - start and end are offsets into lexbuf,
-  strings for attribute/value pairs.
+ *     which contains the textual content of
 *     all elements in the parse tree.
 *   - parent and content allow traversal
 *     of the parse tree in any direction.
 *   - attributes are represented as a linked
 *     list of AttVal nodes which hold the
 *     strings for attribute/value pairs.
 */
 TY_PRIVATE Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer );
-/* used to clone heading nodes when split by an <HR> */
+/**
 *  Used to clone heading nodes when split by an `<HR>`
 */
 TY_PRIVATE Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element );
-/* free node's attributes */
+
 /**
 *  Free node's attributes
 */
 TY_PRIVATE void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node );
-/* doesn't repair attribute list linkage */
+
 /**
 *  Doesn't repair attribute list linkage
 */
 TY_PRIVATE void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av );
-/* detach attribute from node */
+
 /**
 * Detach attribute from node
 */
 TY_PRIVATE void TY_(DetachAttribute)( Node *node, AttVal *attr );
-/* detach attribute from node then free it
+
-*/
+/**
 *  Detach attribute from node then free it.
 */
 TY_PRIVATE void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr );
-/*
+
-  Free document nodes by iterating through peers and recursing
+/**
-  through children. Set next to NULL before calling FreeNode()
+ *  Free document nodes by iterating through peers and recursing
-  to avoid freeing peer nodes. Doesn't patch up prev/next links.
+ *  through children. Set `next` to `NULL` before calling `FreeNode()`
 *  to avoid freeing peer nodes. Doesn't patch up prev/next links.
 */
 TY_PRIVATE void TY_(FreeNode)( TidyDocImpl* doc, Node *node );
 TY_PRIVATE Node* TY_(TextToken)( Lexer *lexer );
-/* used for creating preformatted text from Word2000 */
+
 /**
 *  Used for creating preformatted text from Word2000.
 */
 TY_PRIVATE Node* TY_(NewLineNode)( Lexer *lexer );
-/* used for adding a &nbsp; for Word2000 */
+
 /**
 *  Used for adding a &nbsp; for Word2000.
 */
 TY_PRIVATE Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt );
 TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
 /* TY_PRIVATE void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); */
-/* find element */
+TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
 TY_PRIVATE Node* TY_(FindDocType)( TidyDocImpl* doc );
 TY_PRIVATE Node* TY_(FindHTML)( TidyDocImpl* doc );
 TY_PRIVATE Node* TY_(FindHEAD)( TidyDocImpl* doc );
@ -468,10 +519,16 @@ TY_PRIVATE Node* TY_(FindTITLE)(TidyDocImpl* doc);
 TY_PRIVATE Node* TY_(FindBody)( TidyDocImpl* doc );
 TY_PRIVATE Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
-/* Returns containing block element, if any */
+
 /**
 *  Returns containing block element, if any
 */
 TY_PRIVATE Node* TY_(FindContainer)( Node* node );
-/* add meta element for Tidy */
+
 /**
 *  Add meta element for Tidy.
 */
 TY_PRIVATE Bool TY_(AddGenerator)( TidyDocImpl* doc );
 TY_PRIVATE uint TY_(ApparentVersion)( TidyDocImpl* doc );
@ -485,118 +542,209 @@ TY_PRIVATE Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc );
 TY_PRIVATE Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc );
-/* fixup doctype if missing */
+/**
 *  Fixup doctype if missing.
 */
 TY_PRIVATE Bool TY_(FixDocType)( TidyDocImpl* doc );
-/* ensure XML document starts with <?xml version="1.0"?> */
+
-/* add encoding attribute if not using ASCII or UTF-8 output */
+/**
 *  Ensure XML document starts with <?xml version="1.0"?>, and
 *  add encoding attribute if not using ASCII or UTF-8 output.
 */
 TY_PRIVATE Bool TY_(FixXmlDecl)( TidyDocImpl* doc );
 TY_PRIVATE Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id);
 TY_PRIVATE void TY_(UngetToken)( TidyDocImpl* doc );
 /**
  *  Modes for GetToken().
  *
  *  MixedContent   -- for elements which don't accept PCDATA
  *  Preformatted   -- white space preserved as is
  *  IgnoreMarkup   -- for CDATA elements such as script, style
  *
  *  NOTE(review): IgnoreWhitespace, OtherNamespace and CdataContent are not
  *  described here; confirm their semantics against GetToken() before
  *  documenting them. The enumerator order determines the numeric values,
  *  so append new modes rather than inserting them.
  */
 typedef enum
 {
  IgnoreWhitespace,
  MixedContent,
  Preformatted,
  IgnoreMarkup,
  OtherNamespace,
  CdataContent
 } GetTokenMode;
 TY_PRIVATE Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode );
 TY_PRIVATE void TY_(InitMap)(void);
-/* create a new attribute */
+/**
 *  Create a new attribute.
 */
 TY_PRIVATE AttVal* TY_(NewAttribute)( TidyDocImpl* doc );
-/* create a new attribute with given name and value */
+
 /**
 *  Create a new attribute with given name and value.
 */
 TY_PRIVATE AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
                             int delim );
-/* insert attribute at the end of attribute list of a node */
+
 /**
 *  Insert attribute at the end of attribute list of a node.
 */
 TY_PRIVATE void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av );
-/* insert attribute at the start of attribute list of a node */
+/**
 *  Insert attribute at the start of attribute list of a node.
 */
 TY_PRIVATE void TY_(InsertAttributeAtStart)( Node *node, AttVal *av );
-/*************************************
+
-  In-line Stack functions
+/** @}
-*************************************/
+ *  @name Inline Stack Functions
 *  @{
 */
-/* duplicate attributes */
+/**
 *  Duplicate attributes.
 */
 TY_PRIVATE AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs );
 /*
  push a copy of an inline node onto stack
  but don't push if implicit or OBJECT or APPLET
  (implicit tags are ones generated from the istack)
-  One issue arises with pushing inlines when
+/**
-  the tag is already pushed. For instance:
+ *  Push a copy of an inline node onto stack, but don't push if
-
+ *  implicit or OBJECT or APPLET (implicit tags are ones generated
-      <p><em>text
+ *  from the istack).
-      <p><em>more text
+ *
-
+ *  One issue arises with pushing inlines when the tag is already pushed.
-  Shouldn't be mapped to
+ *  For instance:
-
+ *    ~~~
-      <p><em>text</em></p>
+ *    <p><em>text
-      <p><em><em>more text</em></em>
+ *    <p><em>more text
-*/
+ *    ~~~
 *  Shouldn't be mapped to
 *    ~~~
 *    <p><em>text</em></p>
 *    <p><em><em>more text</em></em>
 *    ~~~
 */
 TY_PRIVATE void TY_(PushInline)( TidyDocImpl* doc, Node* node );
-/* pop inline stack */
+
 /**
 * Pop inline stack.
 */
 TY_PRIVATE void TY_(PopInline)( TidyDocImpl* doc, Node* node );
 TY_PRIVATE Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node );
 TY_PRIVATE Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node );
 /*
  This has the effect of inserting "missing" inline
  elements around the contents of blocklevel elements
  such as P, TD, TH, DIV, PRE etc. This procedure is
  called at the start of ParseBlock. when the inline
  stack is not empty, as will be the case in:
-    <i><h1>italic heading</h1></i>
+/**
-
+ *  This has the effect of inserting "missing" inline elements around the
-  which is then treated as equivalent to
+ *  contents of blocklevel elements such as P, TD, TH, DIV, PRE etc. This
-
+ *  procedure is called at the start of `ParseBlock`, when the inline
-    <h1><i>italic heading</i></h1>
+ *  stack is not empty, as will be the case in:
-
+ *    ~~~
-  This is implemented by setting the lexer into a mode
+ *    <i><h1>italic heading</h1></i>
-  where it gets tokens from the inline stack rather than
+ *    ~~~
-  from the input stream.
+ *  which is then treated as equivalent to
-*/
+ *    ~~~
 *    <h1><i>italic heading</i></h1>
 *    ~~~
 *  This is implemented by setting the lexer into a mode where it gets
 *  tokens from the inline stack rather than from the input stream.
 */
 TY_PRIVATE int TY_(InlineDup)( TidyDocImpl* doc, Node *node );
-/*
+
- defer duplicates when entering a table or other
+/**
- element where the inlines shouldn't be duplicated
+ *  Defer duplicates when entering a table or other
-*/
+ *  element where the inlines shouldn't be duplicated.
 */
 TY_PRIVATE void TY_(DeferDup)( TidyDocImpl* doc );
 TY_PRIVATE Node* TY_(InsertedToken)( TidyDocImpl* doc );
-/* stack manipulation for inline elements */
+/**
 *  Stack manipulation for inline elements
 */
 TY_PRIVATE Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node );
 TY_PRIVATE Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element );
 /** @}
 *  @name Generic stack of nodes.
 *  @{
 */
 /**
  * This typedef represents a stack of addresses to nodes. Tidy uses these to
  * try to limit recursion by pushing nodes to a stack when possible instead
  * of recursing.
  */
 typedef struct _Stack {
    int top;                        /**< Current top position; -1 when the stack is empty. */
    unsigned capacity;              /**< Current capacity. Can be expanded. */
    Node **firstNode;               /**< A pointer to the first pointer to a Node in an array of node addresses. */
    TidyAllocator* allocator;       /**< Tidy's allocator, used at instantiation and expanding. */
 } Stack;
 /**
 * Create a new stack with a given starting capacity. If memory allocation
 * fails, then the allocator will panic the program automatically.
 */
 TY_PRIVATE Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity);
 /**
 *  Increase the stack size. This will be called automatically when the
 *  current stack is full. If memory allocation fails, then the allocator
 *  will panic the program automatically.
 */
 TY_PRIVATE void TY_(growStack)(Stack *stack);
 /**
 * Stack is full when top is equal to the last index.
 */
 TY_PRIVATE Bool TY_(stackFull)(Stack *stack);
 /**
 * Stack is empty when top is equal to -1
 */
 TY_PRIVATE Bool TY_(stackEmpty)(Stack *stack);
 /**
 * Push an item to the stack.
 */
 TY_PRIVATE void TY_(push)(Stack *stack, Node *node);
 /**
 * Pop an item from the stack.
 */
 TY_PRIVATE Node* TY_(pop)(Stack *stack);
 /**
 * Peek at the stack.
 */
 TY_PRIVATE Node* TY_(peek)(Stack *stack);
 /**
 *  Frees the stack when done.
 */
 TY_PRIVATE void TY_(freeStack)(Stack *stack);
 /** @}
 */
 #ifdef __cplusplus
 }
 #endif
 /** @} end parser_h group */
 /** @} end internal_api group */
 #endif /* __LEXER_H__ */
--- a/src/parser.c
+++ b/src/parser.c
--- a/src/parser.h
+++ b/src/parser.h
@ -41,6 +41,74 @@
 ******************************************************************************/
 /**
  *  The parsers keep track of their states with the states defined here, and
  *  use these symbols when pushing to the stack so that they can later recreate
  *  their environments when re-entered.
  */
 typedef enum {
    /* Universal states. */
    STATE_INITIAL,             /**< This is the initial state for every parser. */
    STATE_COMPLETE,            /**< Complete! */
    STATE_PARSE_TAG,           /**< NOTE(review): undocumented; presumably the parser is about to hand a tag to another parser -- confirm. */
    STATE_PARSE_TAG_DONE,      /**< NOTE(review): undocumented; presumably the resume point after STATE_PARSE_TAG -- confirm. */
    /* ParseHTML states. */
    STATE_PRE_HEAD,            /**< In this state, we've not detected head yet. */
    STATE_PRE_BODY,            /**< In this state, we'll consider frames vs. body. */
    STATE_PARSE_BODY,          /**< In this state, we can parse the body. */
    STATE_PARSE_HEAD,          /**< In this state, we will setup head for parsing. */
    STATE_PARSE_HEAD_DONE,     /**< Resume here after parsing head. */
    STATE_PARSE_NOFRAMES,      /**< In this state, we can parse noframes content. */
    STATE_PARSE_NOFRAMES_DONE, /**< In this state, we can restore more state. */
    STATE_PARSE_FRAMESET,      /**< In this state, we will parse frameset content. */
    STATE_PARSE_FRAMESET_DONE, /**< We need to cleanup some things after parsing frameset. */
 } parserState;
 /**
  *  This typedef represents the state of a parser when it enters and exits.
  *  When the parser needs to finish work on the way back up the stack, it will
  *  push one of these records to the stack, and it will pop a record from the
  *  stack upon re-entry.
  */
 typedef struct _TidyParserMemory
 {
    Parser       *identity;      /**< Which parser pushed this record? (Pointer to a `Parser` function.) */
    Node         *original_node; /**< Originally provided node at entry. */
    Node         *reentry_node;  /**< A node a parser might want to save. */
    GetTokenMode reentry_mode;   /**< The mode to use for the next node. */
    parserState  reentry_state;  /**< State to set during re-entry. */
    GetTokenMode mode;           /**< The caller will peek at this value to get the correct mode. */
 } TidyParserMemory;
 /**
  *  This typedef represents a stack of TidyParserMemory records. The Tidy
  *  document has its own instance of this.
  */
 typedef struct _TidyParserStack
 {
    TidyParserMemory* content;    /**< The state records (presumably the base of an array of `size` records -- confirm). */
    TidyAllocator* allocator;     /**< The allocator used for creating. */
    uint size;                    /**< Current size of the stack. */
    int top;                      /**< Top of the stack. */
 } TidyParserStack;
 /**
 *  Allocates and initializes the parser's stack. TidyCreate will perform
 *  this automatically.
 */
 void TY_(InitParserStack)( TidyDocImpl* doc );
 /**
 *  Frees the parser's stack when done. TidyRelease will perform this
 *  automatically.
 */
 void TY_(FreeParserStack)( TidyDocImpl* doc );
 /**
 *  Is used to perform a node integrity check recursively after parsing
 *  an HTML or XML document.
@ -96,7 +164,7 @@ TY_PRIVATE Node *TY_(RemoveNode)(Node *node);
 /**
 *  Remove node from markup tree and discard it.
- *  @param doc The Tidy document from which to discarb the node.
+ *  @param doc The Tidy document from which to discard the node.
 *  @param element The node to discard.
 *  @returns Returns the next node.
 */
@ -202,4 +270,3 @@ TY_PRIVATE void TY_(ParseXMLDocument)( TidyDocImpl* doc );
 /** @} end internal_api group */
 #endif /* __PARSER_H__ */
--- a/src/tags.c
+++ b/src/tags.c
@ -168,7 +168,7 @@ static CheckAttribs CheckHTML;
 \*/
 static Dict tag_defs[] =
 {
-  { TidyTag_UNKNOWN,    "unknown!",   VERS_UNKNOWN,         NULL,                       (0),                                           NULL,          NULL           },
+  { TidyTag_UNKNOWN,    "unknown!",   VERS_UNKNOWN,         NULL,                            (0),                                           NULL,               NULL           },
  /* W3C defined elements */
  { TidyTag_A,          "a",          VERS_ELEM_A,          &TY_(W3CAttrsFor_A)[0],          (CM_INLINE|CM_BLOCK|CM_MIXED),                 TY_(ParseBlock),    NULL           }, /* Issue #167 & #169 - default HTML5 */
@ -332,7 +332,7 @@ static Dict tag_defs[] =
  { TidyTag_WBR,         "wbr",          VERS_ELEM_WBR,         &TY_(W3CAttrsFor_WBR)[0],         (CM_INLINE|CM_EMPTY),          TY_(ParseEmpty),     NULL           },
  /* this must be the final entry */
-  { (TidyTagId)0,        NULL,         0,                    NULL,                       (0),                                           NULL,          NULL           }
+  { (TidyTagId)0,        NULL,           0,                     NULL,                             (0),                           NULL,                NULL           }
 };
 static uint tagsHash(ctmbstr s)
--- a/src/tags.h
+++ b/src/tags.h
@ -61,8 +61,13 @@ typedef enum
 /** This typedef describes a function to be used to parse HTML of a Tidy tag.
 ** @param doc The Tidy document.
 ** @param node The node being parsed.
 ** @param mode The GetTokenMode to be used for parsing the node contents.
 ** @param popStack A flag indicating that we are re-entering this parser, and
 **   it should restore a state from the stack.
 */
-typedef void (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode );
+typedef Node* (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode, Bool popStack );
 /** This typedef describes a function to be used to check the attributes
--- a/src/tidy-int.h
+++ b/src/tidy-int.h
@ -16,6 +16,7 @@
 #include "pprint.h"
 #include "access.h"
 #include "message.h"
 #include "parser.h"
 #ifndef MAX
 #define MAX(a,b) (((a) > (b))?(a):(b))
@ -41,19 +42,20 @@ struct _TidyDocImpl
    Lexer*              lexer;
    /* Config + Markup Declarations */
-    TidyConfigImpl          config;
+    TidyConfigImpl           config;
-    TidyTagImpl             tags;
+    TidyTagImpl              tags;
-    TidyAttribImpl          attribs;
+    TidyAttribImpl           attribs;
-    TidyAccessImpl          access;
+    TidyAccessImpl           access;
-    TidyMutedMessages       muted;
+    TidyMutedMessages        muted;
    /* The Pretty Print buffer */
-    TidyPrintImpl       pprint;
+    TidyPrintImpl            pprint;
    /* I/O */
    StreamIn*                docIn;
    StreamOut*               docOut;
    StreamOut*               errout;
    TidyReportFilter         reportFilter;
    TidyReportCallback       reportCallback;
    TidyMessageCallback      messageCallback;
@ -62,6 +64,8 @@ struct _TidyDocImpl
    TidyConfigChangeCallback pConfigChangeCallback;
    TidyPPProgress           progressCallback;
    TidyParserStack          stack;
    /* Parse + Repair Results */
    uint                optionErrors;
    uint                errors;
--- a/src/tidylib.c
+++ b/src/tidylib.c
@ -112,6 +112,7 @@ TidyDocImpl* tidyDocCreate( TidyAllocator *allocator )
    TY_(InitAttrs)( doc );
    TY_(InitConfig)( doc );
    TY_(InitPrintBuf)( doc );
    TY_(InitParserStack)( doc );
    /* Set the locale for tidy's output. This both configures
    ** LibTidy to use the environment's locale as well as the
@ -172,6 +173,7 @@ void          tidyDocRelease( TidyDocImpl* doc )
         *  to determine which hash is to be used, so free it last.
        \*/
        TY_(FreeLexer)( doc );
        TY_(FreeParserStack)( doc );
        TidyDocFree( doc, doc );
    }
 }