a873a190e1
Improved debug output.
750 lines
21 KiB
C
750 lines
21 KiB
C
#ifndef __LEXER_H__
|
|
#define __LEXER_H__
|
|
|
|
|
|
/**************************************************************************//**
|
|
* @file
|
|
* Lexer for HTML and XML Parsers.
|
|
*
|
|
* Given an input source, it returns a sequence of tokens.
|
|
*
|
|
* GetToken(source) gets the next token
|
|
* UngetToken(source) provides one level undo
|
|
*
|
|
* The tags include an attribute list:
|
|
*
|
|
* - linked list of attribute/value nodes
|
|
* - each node has 2 NULL-terminated strings.
|
|
* - entities are replaced in attribute values
|
|
*
|
|
* white space is compacted if not in preformatted mode
|
|
* If not in preformatted mode then leading white space
|
|
* is discarded and subsequent white space sequences
|
|
* compacted to single space characters.
|
|
*
|
|
* If XmlTags is no then Tag names are folded to upper
|
|
* case and attribute names to lower case.
|
|
*
|
|
* Not yet done:
|
|
* - Doctype subset and marked sections
|
|
*
|
|
* @author HTACG, et al (consult git log)
|
|
*
|
|
* @copyright
|
|
* (c) 1998-2021 (W3C) MIT, ERCIM, Keio University, and HTACG.
|
|
* See tidy.h for the copyright notice.
|
|
* @par
|
|
* All Rights Reserved.
|
|
* @par
|
|
* See `tidy.h` for the complete license.
|
|
*
|
|
* @date Additional updates: consult git log
|
|
*
|
|
******************************************************************************/
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
#include "forward.h"
|
|
|
|
/** @addtogroup internal_api */
|
|
/** @{ */
|
|
|
|
|
|
/***************************************************************************//**
|
|
** @defgroup lexer_h HTML and XML Lexing
|
|
**
|
|
** These functions and structures form the internal API for document
|
|
** lexing.
|
|
**
|
|
** @{
|
|
******************************************************************************/
|
|
|
|
|
|
/**
|
|
* Lexer character types.
|
|
*/
|
|
#define digit 1u
|
|
#define letter 2u
|
|
#define namechar 4u
|
|
#define white 8u
|
|
#define newline 16u
|
|
#define lowercase 32u
|
|
#define uppercase 64u
|
|
#define digithex 128u
|
|
|
|
|
|
/**
|
|
* node->type is one of these values
|
|
*/
|
|
typedef enum
|
|
{
|
|
RootNode,
|
|
DocTypeTag,
|
|
CommentTag,
|
|
ProcInsTag,
|
|
TextNode,
|
|
StartTag,
|
|
EndTag,
|
|
StartEndTag,
|
|
CDATATag,
|
|
SectionTag,
|
|
AspTag,
|
|
JsteTag,
|
|
PhpTag,
|
|
XmlDecl
|
|
} NodeType;
|
|
|
|
|
|
/**
|
|
* Lexer GetToken() states.
|
|
*/
|
|
typedef enum
|
|
{
|
|
LEX_CONTENT,
|
|
LEX_GT,
|
|
LEX_ENDTAG,
|
|
LEX_STARTTAG,
|
|
LEX_COMMENT,
|
|
LEX_DOCTYPE,
|
|
LEX_PROCINSTR,
|
|
LEX_CDATA,
|
|
LEX_SECTION,
|
|
LEX_ASP,
|
|
LEX_JSTE,
|
|
LEX_PHP,
|
|
LEX_XMLDECL
|
|
} LexerState;
|
|
|
|
|
|
/**
|
|
* ParseDocTypeDecl state constants.
|
|
*/
|
|
typedef enum
|
|
{
|
|
DT_INTERMEDIATE,
|
|
DT_DOCTYPENAME,
|
|
DT_PUBLICSYSTEM,
|
|
DT_QUOTEDSTRING,
|
|
DT_INTSUBSET
|
|
} ParseDocTypeDeclState;
|
|
|
|
|
|
/**
|
|
* Content model shortcut encoding.
|
|
* Descriptions are tentative.
|
|
*/
|
|
#define CM_UNKNOWN 0
|
|
#define CM_EMPTY (1 << 0) /**< Elements with no content. Map to HTML specification. */
|
|
#define CM_HTML (1 << 1) /**< Elements that appear outside of "BODY". */
|
|
#define CM_HEAD (1 << 2) /**< Elements that can appear within HEAD. */
|
|
#define CM_BLOCK (1 << 3) /**< HTML "block" elements. */
|
|
#define CM_INLINE (1 << 4) /**< HTML "inline" elements. */
|
|
#define CM_LIST (1 << 5) /**< Elements that mark list item ("LI"). */
|
|
#define CM_DEFLIST (1 << 6) /**< Elements that mark definition list item ("DL", "DT"). */
|
|
#define CM_TABLE (1 << 7) /**< Elements that can appear inside TABLE. */
|
|
#define CM_ROWGRP (1 << 8) /**< Used for "THEAD", "TFOOT" or "TBODY". */
|
|
#define CM_ROW (1 << 9) /**< Used for "TD", "TH" */
|
|
#define CM_FIELD (1 << 10) /**< Elements whose content must be protected against white space movement. Includes some elements that can found in forms. */
|
|
#define CM_OBJECT (1 << 11) /**< Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET. */
|
|
#define CM_PARAM (1 << 12) /**< Elements that allows "PARAM". */
|
|
#define CM_FRAMES (1 << 13) /**< "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
|
|
#define CM_HEADING (1 << 14) /**< Heading elements (h1, h2, ...). */
|
|
#define CM_OPT (1 << 15) /**< Elements with an optional end tag. */
|
|
#define CM_IMG (1 << 16) /**< Elements that use "align" attribute for vertical position. */
|
|
#define CM_MIXED (1 << 17) /**< Elements with inline and block model. Used to avoid calling InlineDup. */
|
|
#define CM_NO_INDENT (1 << 18) /**< Elements whose content needs to be indented only if containing one CM_BLOCK element. */
|
|
#define CM_OBSOLETE (1 << 19) /**< Elements that are obsolete (such as "dir", "menu"). */
|
|
#define CM_NEW (1 << 20) /**< User defined elements. Used to determine how attributes without value should be printed. */
|
|
#define CM_OMITST (1 << 21) /**< Elements that cannot be omitted. */
|
|
#define CM_VOID (1 << 22) /**< Elements that are void per https://www.w3.org/TR/2011/WD-html-markup-20110113/syntax.html#syntax-elements. */
|
|
|
|
|
|
/**
|
|
* If the document uses just HTML 2.0 tags and attributes described
|
|
* it is HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
|
|
* If there are proprietary tags and attributes then describe it as
|
|
* HTML Proprietary. If it includes the xml-lang or xmlns attributes
|
|
* but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
|
|
* flavors of Voyager (strict, loose or frameset).
|
|
*/
|
|
|
|
/* unknown */
|
|
#define xxxx 0u
|
|
|
|
/* W3C defined HTML/XHTML family document types */
|
|
#define HT20 1u
|
|
#define HT32 2u
|
|
#define H40S 4u
|
|
#define H40T 8u
|
|
#define H40F 16u
|
|
#define H41S 32u
|
|
#define H41T 64u
|
|
#define H41F 128u
|
|
#define X10S 256u
|
|
#define X10T 512u
|
|
#define X10F 1024u
|
|
#define XH11 2048u
|
|
#define XB10 4096u
|
|
|
|
/* proprietary stuff */
|
|
#define VERS_SUN 8192u
|
|
#define VERS_NETSCAPE 16384u
|
|
#define VERS_MICROSOFT 32768u
|
|
|
|
/* special flag */
|
|
#define VERS_XML 65536u
|
|
|
|
/* HTML5 */
|
|
#define HT50 131072u
|
|
#define XH50 262144u
|
|
|
|
/* compatibility symbols */
|
|
#define VERS_UNKNOWN (xxxx)
|
|
#define VERS_HTML20 (HT20)
|
|
#define VERS_HTML32 (HT32)
|
|
#define VERS_HTML40_STRICT (H40S|H41S|X10S)
|
|
#define VERS_HTML40_LOOSE (H40T|H41T|X10T)
|
|
#define VERS_FRAMESET (H40F|H41F|X10F)
|
|
#define VERS_XHTML11 (XH11)
|
|
#define VERS_BASIC (XB10)
|
|
/* HTML5 */
|
|
#define VERS_HTML5 (HT50|XH50)
|
|
|
|
/* meta symbols */
|
|
#define VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)
|
|
#define VERS_IFRAME (VERS_HTML40_LOOSE|VERS_FRAMESET)
|
|
#define VERS_LOOSE (VERS_HTML20|VERS_HTML32|VERS_IFRAME)
|
|
#define VERS_EVENTS (VERS_HTML40|VERS_XHTML11)
|
|
#define VERS_FROM32 (VERS_HTML32|VERS_HTML40|HT50)
|
|
#define VERS_FROM40 (VERS_HTML40|VERS_XHTML11|VERS_BASIC|VERS_HTML5)
|
|
#define VERS_XHTML (X10S|X10T|X10F|XH11|XB10|XH50)
|
|
|
|
/* strict */
|
|
#define VERS_STRICT (VERS_HTML5|VERS_HTML40_STRICT)
|
|
|
|
/* all W3C defined document types */
|
|
#define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40|XH50|HT50)
|
|
|
|
/* all proprietary types */
|
|
#define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
|
|
|
|
|
|
/**
|
|
* Linked list of class names and styles
|
|
*/
|
|
struct _Style;
|
|
typedef struct _Style TagStyle;
|
|
|
|
struct _Style
|
|
{
|
|
tmbstr tag;
|
|
tmbstr tag_class;
|
|
tmbstr properties;
|
|
TagStyle *next;
|
|
};
|
|
|
|
|
|
/**
|
|
* Linked list of style properties
|
|
*/
|
|
struct _StyleProp;
|
|
typedef struct _StyleProp StyleProp;
|
|
|
|
struct _StyleProp
|
|
{
|
|
tmbstr name;
|
|
tmbstr value;
|
|
StyleProp *next;
|
|
};
|
|
|
|
|
|
/**
|
|
* Attribute/Value linked list node
|
|
*/
|
|
struct _AttVal
|
|
{
|
|
AttVal* next;
|
|
const Attribute* dict;
|
|
Node* asp;
|
|
Node* php;
|
|
int delim;
|
|
tmbstr attribute;
|
|
tmbstr value;
|
|
};
|
|
|
|
|
|
/**
|
|
* Mosaic handles inlines via a separate stack from other elements
|
|
* We duplicate this to recover from inline markup errors such as:
|
|
* ~~~
|
|
* <i>italic text
|
|
* <p>more italic text</b> normal text
|
|
* ~~~
|
|
* which for compatibility with Mosaic is mapped to:
|
|
* ~~~
|
|
* <i>italic text</i>
|
|
* <p><i>more italic text</i> normal text
|
|
* ~~~
|
|
* Note that any inline end tag pop's the effect of the current
|
|
* inline start tag, so that `</b>` pop's `<i>` in the above example.
|
|
*/
|
|
struct _IStack
|
|
{
|
|
IStack* next;
|
|
const Dict* tag; /**< tag's dictionary definition */
|
|
tmbstr element; /**< name (NULL for text nodes) */
|
|
AttVal* attributes;
|
|
};
|
|
|
|
|
|
/**
|
|
* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc.
|
|
*/
|
|
struct _Node
|
|
{
|
|
Node* parent; /**< tree structure */
|
|
Node* prev;
|
|
Node* next;
|
|
Node* content;
|
|
Node* last;
|
|
|
|
AttVal* attributes;
|
|
const Dict* was; /**< old tag when it was changed */
|
|
const Dict* tag; /**< tag's dictionary definition */
|
|
|
|
tmbstr element; /**< name (NULL for text nodes) */
|
|
|
|
uint start; /**< start of span onto text array */
|
|
uint end; /**< end of span onto text array */
|
|
NodeType type; /**< TextNode, StartTag, EndTag etc. */
|
|
|
|
uint line; /**< current line of document */
|
|
uint column; /**< current column of document */
|
|
|
|
int idx; /**< general purpose register */
|
|
|
|
Bool closed; /**< true if closed by explicit end tag */
|
|
Bool implicit; /**< true if inferred */
|
|
Bool linebreak; /**< true if followed by a line break */
|
|
};
|
|
|
|
|
|
/**
|
|
* The following are private to the lexer.
|
|
* Use `NewLexer()` to create a lexer, and `FreeLexer()` to free it.
|
|
*/
|
|
struct _Lexer
|
|
{
|
|
uint lines; /**< lines seen */
|
|
uint columns; /**< at start of current token */
|
|
Bool waswhite; /**< used to collapse contiguous white space */
|
|
Bool pushed; /**< true after token has been pushed back */
|
|
Bool insertspace; /**< when space is moved after end tag */
|
|
Bool excludeBlocks; /**< Netscape compatibility */
|
|
Bool exiled; /**< true if moved out of table */
|
|
Bool isvoyager; /**< true if xmlns attribute on html element (i.e., "Voyager" was the W3C codename for XHTML). */
|
|
uint versions; /**< bit vector of HTML versions */
|
|
uint doctype; /**< version as given by doctype (if any) */
|
|
uint versionEmitted; /**< version of doctype emitted */
|
|
Bool bad_doctype; /**< e.g. if html or PUBLIC is missing */
|
|
uint txtstart; /**< start of current node */
|
|
uint txtend; /**< end of current node */
|
|
LexerState state; /**< state of lexer's finite state machine */
|
|
|
|
Node* token; /**< last token returned by GetToken() */
|
|
Node* itoken; /**< last duplicate inline returned by GetToken() */
|
|
Node* root; /**< remember root node of the document */
|
|
Node* parent; /**< remember parent node for CDATA elements */
|
|
|
|
Bool seenEndBody; /**< true if a `</body>` tag has been encountered */
|
|
Bool seenEndHtml; /**< true if a `</html>` tag has been encountered */
|
|
|
|
/*
|
|
Lexer character buffer
|
|
|
|
Parse tree nodes span onto this buffer
|
|
which contains the concatenated text
|
|
contents of all of the elements.
|
|
|
|
lexsize must be reset for each file.
|
|
*/
|
|
tmbstr lexbuf; /**< MB character buffer */
|
|
uint lexlength; /**< allocated */
|
|
uint lexsize; /**< used */
|
|
|
|
/* Inline stack for compatibility with Mosaic */
|
|
Node* inode; /**< for deferring text node */
|
|
IStack* insert; /**< for inferring inline tags */
|
|
IStack* istack;
|
|
uint istacklength; /**< allocated */
|
|
uint istacksize; /**< used */
|
|
uint istackbase; /**< start of frame */
|
|
|
|
TagStyle *styles; /**< used for cleaning up presentation markup */
|
|
|
|
TidyAllocator* allocator; /**< allocator */
|
|
};
|
|
|
|
|
|
/**
|
|
* modes for GetToken()
|
|
*/
|
|
typedef enum
|
|
{
|
|
IgnoreWhitespace, /**< */
|
|
MixedContent, /**< for elements which don't accept PCDATA */
|
|
Preformatted, /**< white space preserved as is */
|
|
IgnoreMarkup, /**< for CDATA elements such as script, style */
|
|
OtherNamespace, /**< */
|
|
CdataContent /**< */
|
|
} GetTokenMode;
|
|
|
|
|
|
/** @name Lexer Functions
|
|
* @{
|
|
*/
|
|
|
|
|
|
/**
|
|
* Choose what version to use for new doctype
|
|
*/
|
|
TY_PRIVATE int TY_(HTMLVersion)( TidyDocImpl* doc );
|
|
|
|
|
|
/**
|
|
* Everything is allowed in proprietary version of HTML.
|
|
* This is handled here rather than in the tag/attr dicts
|
|
*/
|
|
TY_PRIVATE void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers );
|
|
|
|
TY_PRIVATE Bool TY_(IsWhite)(uint c);
|
|
TY_PRIVATE Bool TY_(IsDigit)(uint c);
|
|
TY_PRIVATE Bool TY_(IsLetter)(uint c);
|
|
TY_PRIVATE Bool TY_(IsHTMLSpace)(uint c);
|
|
TY_PRIVATE Bool TY_(IsNewline)(uint c);
|
|
TY_PRIVATE Bool TY_(IsNamechar)(uint c);
|
|
TY_PRIVATE Bool TY_(IsXMLLetter)(uint c);
|
|
TY_PRIVATE Bool TY_(IsXMLNamechar)(uint c);
|
|
|
|
TY_PRIVATE Bool TY_(IsUpper)(uint c);
|
|
TY_PRIVATE uint TY_(ToLower)(uint c);
|
|
TY_PRIVATE uint TY_(ToUpper)(uint c);
|
|
|
|
TY_PRIVATE Lexer* TY_(NewLexer)( TidyDocImpl* doc );
|
|
TY_PRIVATE void TY_(FreeLexer)( TidyDocImpl* doc );
|
|
|
|
|
|
/**
|
|
* Store character c as UTF-8 encoded byte stream
|
|
*/
|
|
TY_PRIVATE void TY_(AddCharToLexer)( Lexer *lexer, uint c );
|
|
|
|
|
|
/**
|
|
* Used for elements and text nodes.
|
|
* - Element name is NULL for text nodes.
|
|
* - start and end are offsets into lexbuf,
|
|
* which contains the textual content of
|
|
* all elements in the parse tree.
|
|
* - parent and content allow traversal
|
|
* of the parse tree in any direction.
|
|
* - attributes are represented as a linked
|
|
* list of AttVal nodes which hold the
|
|
* strings for attribute/value pairs.
|
|
*/
|
|
TY_PRIVATE Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer );
|
|
|
|
|
|
/**
|
|
* Used to clone heading nodes when split by an `<HR>`
|
|
*/
|
|
TY_PRIVATE Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element );
|
|
|
|
|
|
/**
|
|
* Free node's attributes
|
|
*/
|
|
TY_PRIVATE void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node );
|
|
|
|
|
|
/**
|
|
* Doesn't repair attribute list linkage
|
|
*/
|
|
TY_PRIVATE void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av );
|
|
|
|
|
|
/**
|
|
* Detach attribute from node
|
|
*/
|
|
TY_PRIVATE void TY_(DetachAttribute)( Node *node, AttVal *attr );
|
|
|
|
|
|
/**
|
|
* Detach attribute from node then free it.
|
|
*/
|
|
TY_PRIVATE void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr );
|
|
|
|
|
|
/**
|
|
* Free document nodes by iterating through peers and recursing
|
|
* through children. Set `next` to `NULL` before calling `FreeNode()`
|
|
* to avoid freeing peer nodes. Doesn't patch up prev/next links.
|
|
*/
|
|
TY_PRIVATE void TY_(FreeNode)( TidyDocImpl* doc, Node *node );
|
|
|
|
|
|
TY_PRIVATE Node* TY_(TextToken)( Lexer *lexer );
|
|
|
|
|
|
/**
|
|
* Used for creating preformatted text from Word2000.
|
|
*/
|
|
TY_PRIVATE Node* TY_(NewLineNode)( Lexer *lexer );
|
|
|
|
|
|
/**
|
|
* Used for adding a for Word2000.
|
|
*/
|
|
TY_PRIVATE Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt );
|
|
|
|
|
|
TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
|
|
TY_PRIVATE Node* TY_(FindDocType)( TidyDocImpl* doc );
|
|
TY_PRIVATE Node* TY_(FindHTML)( TidyDocImpl* doc );
|
|
TY_PRIVATE Node* TY_(FindHEAD)( TidyDocImpl* doc );
|
|
TY_PRIVATE Node* TY_(FindTITLE)(TidyDocImpl* doc);
|
|
TY_PRIVATE Node* TY_(FindBody)( TidyDocImpl* doc );
|
|
TY_PRIVATE Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
|
|
|
|
|
|
/**
|
|
* Returns containing block element, if any
|
|
*/
|
|
TY_PRIVATE Node* TY_(FindContainer)( Node* node );
|
|
|
|
|
|
/**
|
|
* Add meta element for Tidy.
|
|
*/
|
|
TY_PRIVATE Bool TY_(AddGenerator)( TidyDocImpl* doc );
|
|
|
|
TY_PRIVATE uint TY_(ApparentVersion)( TidyDocImpl* doc );
|
|
|
|
TY_PRIVATE ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool isXhtml );
|
|
|
|
TY_PRIVATE uint TY_(HTMLVersionNumberFromCode)( uint vers );
|
|
|
|
TY_PRIVATE Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc );
|
|
|
|
TY_PRIVATE Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc );
|
|
|
|
|
|
/**
|
|
* Fixup doctype if missing.
|
|
*/
|
|
TY_PRIVATE Bool TY_(FixDocType)( TidyDocImpl* doc );
|
|
|
|
|
|
/**
|
|
* Ensure XML document starts with <?xml version="1.0"?>,and
|
|
* add encoding attribute if not using ASCII or UTF-8 output.
|
|
*/
|
|
TY_PRIVATE Bool TY_(FixXmlDecl)( TidyDocImpl* doc );
|
|
|
|
|
|
TY_PRIVATE Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id);
|
|
|
|
TY_PRIVATE void TY_(UngetToken)( TidyDocImpl* doc );
|
|
|
|
TY_PRIVATE Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode );
|
|
|
|
TY_PRIVATE void TY_(InitMap)(void);
|
|
|
|
|
|
/**
|
|
* Create a new attribute.
|
|
*/
|
|
TY_PRIVATE AttVal* TY_(NewAttribute)( TidyDocImpl* doc );
|
|
|
|
|
|
/**
|
|
* Create a new attribute with given name and value.
|
|
*/
|
|
TY_PRIVATE AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
|
|
int delim );
|
|
|
|
|
|
/**
|
|
* Insert attribute at the end of attribute list of a node.
|
|
*/
|
|
TY_PRIVATE void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av );
|
|
|
|
/**
|
|
* Insert attribute at the start of attribute list of a node.
|
|
*/
|
|
TY_PRIVATE void TY_(InsertAttributeAtStart)( Node *node, AttVal *av );
|
|
|
|
|
|
/** @}
|
|
* @name Inline Stack Functions
|
|
* @{
|
|
*/
|
|
|
|
|
|
/**
|
|
* Duplicate attributes.
|
|
*/
|
|
TY_PRIVATE AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs );
|
|
|
|
|
|
/**
|
|
* Push a copy of an inline node onto stack, but don't push if
|
|
* implicit or OBJECT or APPLET (implicit tags are ones generated
|
|
* from the istack).
|
|
*
|
|
* One issue arises with pushing inlines when the tag is already pushed.
|
|
* For instance:
|
|
* ~~~
|
|
* <p><em>text
|
|
* <p><em>more text
|
|
* ~~~
|
|
* Shouldn't be mapped to
|
|
* ~~~
|
|
* <p><em>text</em></p>
|
|
* <p><em><em>more text</em></em>
|
|
* ~~~
|
|
*/
|
|
TY_PRIVATE void TY_(PushInline)( TidyDocImpl* doc, Node* node );
|
|
|
|
|
|
/**
|
|
* Pop inline stack.
|
|
*/
|
|
TY_PRIVATE void TY_(PopInline)( TidyDocImpl* doc, Node* node );
|
|
|
|
|
|
TY_PRIVATE Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node );
|
|
TY_PRIVATE Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node );
|
|
|
|
|
|
/**
|
|
* This has the effect of inserting "missing" inline elements around the
|
|
* contents of blocklevel elements such as P, TD, TH, DIV, PRE etc. This
|
|
* procedure is called at the start of `ParseBlock`, when the inline
|
|
* stack is not empty, as will be the case in:
|
|
* ~~~
|
|
* <i><h1>italic heading</h1></i>
|
|
* ~~~
|
|
* which is then treated as equivalent to
|
|
* ~~~
|
|
* <h1><i>italic heading</i></h1>
|
|
* ~~~
|
|
* This is implemented by setting the lexer into a mode where it gets
|
|
* tokens from the inline stack rather than from the input stream.
|
|
*/
|
|
TY_PRIVATE int TY_(InlineDup)( TidyDocImpl* doc, Node *node );
|
|
|
|
|
|
/**
|
|
* Defer duplicates when entering a table or other
|
|
* element where the inlines shouldn't be duplicated.
|
|
*/
|
|
TY_PRIVATE void TY_(DeferDup)( TidyDocImpl* doc );
|
|
|
|
|
|
TY_PRIVATE Node* TY_(InsertedToken)( TidyDocImpl* doc );
|
|
|
|
/**
|
|
* Stack manipulation for inline elements
|
|
*/
|
|
TY_PRIVATE Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node );
|
|
|
|
|
|
TY_PRIVATE Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element );
|
|
|
|
|
|
/** @}
|
|
* @name Generic stack of nodes.
|
|
* @{
|
|
*/
|
|
|
|
|
|
/**
|
|
* This typedef represents a stack of addresses to nodes. Tidy uses these to
|
|
* try to limit recursion by pushing nodes to a stack when possible instead
|
|
* of recursing.
|
|
*/
|
|
typedef struct _Stack {
|
|
int top; /**< Current top position. */
|
|
unsigned capacity; /**< Current capacity. Can be expanded. */
|
|
Node **firstNode; /** A pointer to the first pointer to a Node in an array of node addresses. */
|
|
TidyAllocator* allocator; /**< Tidy's allocator, used at instantiation and expanding. */
|
|
} Stack;
|
|
|
|
|
|
/**
|
|
* Create a new stack with a given starting capacity. If memory allocation
|
|
* fails, then the allocator will panic the program automatically.
|
|
*/
|
|
TY_PRIVATE Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity);
|
|
|
|
|
|
/**
|
|
* Increase the stack size. This will be called automatically when the
|
|
* current stack is full. If memory allocation fails, then the allocator
|
|
* will panic the program automatically.
|
|
*/
|
|
TY_PRIVATE void TY_(growStack)(Stack *stack);
|
|
|
|
|
|
/**
|
|
* Stack is full when top is equal to the last index.
|
|
*/
|
|
TY_PRIVATE Bool TY_(stackFull)(Stack *stack);
|
|
|
|
|
|
/**
|
|
* Stack is empty when top is equal to -1
|
|
*/
|
|
TY_PRIVATE Bool TY_(stackEmpty)(Stack *stack);
|
|
|
|
|
|
/**
|
|
* Push an item to the stack.
|
|
*/
|
|
TY_PRIVATE void TY_(push)(Stack *stack, Node *node);
|
|
|
|
|
|
/**
|
|
* Pop an item from the stack.
|
|
*/
|
|
TY_PRIVATE Node* TY_(pop)(Stack *stack);
|
|
|
|
|
|
/**
|
|
* Peek at the stack.
|
|
*/
|
|
TY_PRIVATE Node* TY_(peek)(Stack *stack);
|
|
|
|
/**
|
|
* Frees the stack when done.
|
|
*/
|
|
TY_PRIVATE void TY_(freeStack)(Stack *stack);
|
|
|
|
|
|
/** @}
|
|
*/
|
|
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
|
|
/** @} end parser_h group */
|
|
/** @} end internal_api group */
|
|
|
|
#endif /* __LEXER_H__ */
|