2011-11-17 02:44:16 +00:00
# ifndef __LEXER_H__
# define __LEXER_H__
2021-07-28 23:45:57 +00:00
/**************************************************************************/ /**
* @ file
* Lexer for HTML and XML Parsers .
*
* Given an input source , it returns a sequence of tokens .
*
* GetToken ( source ) gets the next token
* UngetToken ( source ) provides one level undo
*
* The tags include an attribute list :
*
* - linked list of attribute / value nodes
* - each node has 2 NULL - terminated strings .
* - entities are replaced in attribute values
*
* white space is compacted if not in preformatted mode
* If not in preformatted mode then leading white space
* is discarded and subsequent white space sequences
* compacted to single space characters .
*
* If XmlTags is no then Tag names are folded to upper
* case and attribute names to lower case .
*
* Not yet done :
* - Doctype subset and marked sections
*
* @ author HTACG , et al ( consult git log )
*
* @ copyright
* ( c ) 1998 - 2021 ( W3C ) MIT , ERCIM , Keio University , and HTACG .
* See tidy . h for the copyright notice .
* @ par
* All Rights Reserved .
* @ par
* See ` tidy . h ` for the complete license .
*
* @ date Additional updates : consult git log
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2011-11-17 02:44:16 +00:00
# ifdef __cplusplus
extern " C " {
# endif
# include "forward.h"
2021-07-28 23:45:57 +00:00
/** @addtogroup internal_api */
/** @{ */
/***************************************************************************/ /**
* * @ defgroup lexer_h HTML and XML Lexing
* *
* * These functions and structures form the internal API for document
* * lexing .
* *
* * @ {
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/**
* Lexer character types .
*/
2011-11-17 02:44:16 +00:00
/* Each character class occupies its own bit so classes can be OR-ed together. */
#define digit      (1u << 0)
#define letter     (1u << 1)
#define namechar   (1u << 2)
#define white      (1u << 3)
#define newline    (1u << 4)
#define lowercase  (1u << 5)
#define uppercase  (1u << 6)
#define digithex   (1u << 7)
2021-07-28 23:45:57 +00:00
/**
* node - > type is one of these values
*/
2011-11-17 02:44:16 +00:00
/* Kind of a parse-tree node. Values are consecutive, starting at 0. */
typedef enum
{
    RootNode = 0,   /**< root of the document tree */
    DocTypeTag,     /**< DOCTYPE declaration */
    CommentTag,     /**< comment */
    ProcInsTag,     /**< processing instruction */
    TextNode,       /**< text */
    StartTag,       /**< start tag */
    EndTag,         /**< end tag */
    StartEndTag,    /**< presumably a combined start/end tag -- confirm */
    CDATATag,       /**< CDATA section */
    SectionTag,     /**< presumably a marked section -- confirm */
    AspTag,         /**< ASP markup */
    JsteTag,        /**< JSTE markup */
    PhpTag,         /**< PHP markup */
    XmlDecl         /**< XML declaration */
} NodeType;
2021-07-28 23:45:57 +00:00
/**
* Lexer GetToken ( ) states .
*/
2011-11-17 02:44:16 +00:00
/*
 * States of the GetToken() state machine. Values are consecutive from 0.
 * NOTE(review): the per-state notes are read off the state names only;
 * confirm against the GetToken() implementation.
 */
typedef enum
{
    LEX_CONTENT = 0,  /**< element content */
    LEX_GT,           /**< presumably: looking for '>' */
    LEX_ENDTAG,       /**< end tag */
    LEX_STARTTAG,     /**< start tag */
    LEX_COMMENT,      /**< comment */
    LEX_DOCTYPE,      /**< DOCTYPE declaration */
    LEX_PROCINSTR,    /**< processing instruction */
    LEX_CDATA,        /**< CDATA section */
    LEX_SECTION,      /**< presumably a marked section */
    LEX_ASP,          /**< ASP markup */
    LEX_JSTE,         /**< JSTE markup */
    LEX_PHP,          /**< PHP markup */
    LEX_XMLDECL       /**< XML declaration */
} LexerState;
2021-07-28 23:45:57 +00:00
/**
* ParseDocTypeDecl state constants .
*/
2011-11-17 02:44:16 +00:00
/*
 * States used while parsing a DOCTYPE declaration. Values are consecutive
 * from 0.
 * NOTE(review): the per-state notes are read off the state names only;
 * confirm against ParseDocTypeDecl.
 */
typedef enum
{
    DT_INTERMEDIATE = 0,  /**< between the other states */
    DT_DOCTYPENAME,       /**< doctype name */
    DT_PUBLICSYSTEM,      /**< presumably the PUBLIC / SYSTEM keyword */
    DT_QUOTEDSTRING,      /**< quoted string */
    DT_INTSUBSET          /**< presumably the internal subset */
} ParseDocTypeDeclState;
2021-07-28 23:45:57 +00:00
/**
* Content model shortcut encoding .
* Descriptions are tentative .
*/
2011-11-17 02:44:16 +00:00
# define CM_UNKNOWN 0
2021-07-28 23:45:57 +00:00
# define CM_EMPTY (1 << 0) /**< Elements with no content. Map to HTML specification. */
# define CM_HTML (1 << 1) /**< Elements that appear outside of "BODY". */
# define CM_HEAD (1 << 2) /**< Elements that can appear within HEAD. */
# define CM_BLOCK (1 << 3) /**< HTML "block" elements. */
# define CM_INLINE (1 << 4) /**< HTML "inline" elements. */
# define CM_LIST (1 << 5) /**< Elements that mark list item ("LI"). */
# define CM_DEFLIST (1 << 6) /**< Elements that mark definition list item ("DD", "DT"). */
# define CM_TABLE (1 << 7) /**< Elements that can appear inside TABLE. */
# define CM_ROWGRP (1 << 8) /**< Used for "THEAD", "TFOOT" or "TBODY". */
# define CM_ROW (1 << 9) /**< Used for "TD", "TH" */
# define CM_FIELD (1 << 10) /**< Elements whose content must be protected against white space movement. Includes some elements that can be found in forms. */
# define CM_OBJECT (1 << 11) /**< Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET. */
# define CM_PARAM (1 << 12) /**< Elements that allow "PARAM". */
# define CM_FRAMES (1 << 13) /**< "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
# define CM_HEADING (1 << 14) /**< Heading elements (h1, h2, ...). */
# define CM_OPT (1 << 15) /**< Elements with an optional end tag. */
# define CM_IMG (1 << 16) /**< Elements that use "align" attribute for vertical position. */
# define CM_MIXED (1 << 17) /**< Elements with inline and block model. Used to avoid calling InlineDup. */
# define CM_NO_INDENT (1 << 18) /**< Elements whose content needs to be indented only if containing one CM_BLOCK element. */
# define CM_OBSOLETE (1 << 19) /**< Elements that are obsolete (such as "dir", "menu"). */
# define CM_NEW (1 << 20) /**< User defined elements. Used to determine how attributes without value should be printed. */
# define CM_OMITST (1 << 21) /**< Elements that cannot be omitted. */
/**
* If the document uses just HTML 2.0 tags and attributes described
* it is HTML 2.0 . Similarly for HTML 3.2 and the 3 flavors of HTML 4.0 .
* If there are proprietary tags and attributes then describe it as
* HTML Proprietary . If it includes the xml - lang or xmlns attributes
* but is otherwise HTML 2.0 , 3.2 or 4.0 then describe it as one of the
* flavors of Voyager ( strict , loose or frameset ) .
*/
2011-11-17 02:44:16 +00:00
/* unknown: no version bit set */
# define xxxx 0u
/*
 * W3C defined HTML/XHTML family document types.
 * Each symbol is a distinct single bit (1u << 0 through 1u << 12), so sets
 * of versions are built with `|`. The S/T/F suffixes correspond to the
 * STRICT / LOOSE / FRAMESET compatibility symbols defined further down.
 * HT20 and HT32 are aliased below as VERS_HTML20 and VERS_HTML32, XH11 as
 * VERS_XHTML11 and XB10 as VERS_BASIC.
 * NOTE(review): H40x / H41x / X10x presumably stand for HTML 4.0,
 * HTML 4.01 and XHTML 1.0 -- confirm.
 */
# define HT20 1u
# define HT32 2u
# define H40S 4u
# define H40T 8u
# define H40F 16u
# define H41S 32u
# define H41T 64u
# define H41F 128u
# define X10S 256u
# define X10T 512u
# define X10F 1024u
# define XH11 2048u
# define XB10 4096u
/* proprietary stuff (bits 13..15) */
# define VERS_SUN 8192u
# define VERS_NETSCAPE 16384u
# define VERS_MICROSOFT 32768u
/* special flag (bit 16) */
# define VERS_XML 65536u
2014-08-03 18:33:29 +00:00
/* HTML5 (bits 17 and 18); XH50 is also a member of VERS_XHTML below */
2011-11-17 02:44:16 +00:00
# define HT50 131072u
# define XH50 262144u
/* compatibility symbols: each groups the same flavor across the HTML 4 / XHTML 1.0 bits */
# define VERS_UNKNOWN (xxxx)
# define VERS_HTML20 (HT20)
# define VERS_HTML32 (HT32)
# define VERS_HTML40_STRICT (H40S|H41S|X10S)
# define VERS_HTML40_LOOSE (H40T|H41T|X10T)
# define VERS_FRAMESET (H40F|H41F|X10F)
# define VERS_XHTML11 (XH11)
# define VERS_BASIC (XB10)
2014-08-03 18:33:29 +00:00
/* HTML5: both HT50 and XH50 */
# define VERS_HTML5 (HT50|XH50)
2011-11-17 02:44:16 +00:00
/* meta symbols: unions of the compatibility symbols above */
# define VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)
# define VERS_IFRAME (VERS_HTML40_LOOSE|VERS_FRAMESET)
# define VERS_LOOSE (VERS_HTML20|VERS_HTML32|VERS_IFRAME)
# define VERS_EVENTS (VERS_HTML40|VERS_XHTML11)
2017-11-17 01:52:32 +00:00
/*
 * NOTE(review): VERS_FROM32 contains HT50 but, unlike VERS_FROM40, none of
 * XH11, XB10 or XH50 -- confirm this asymmetry is intentional.
 */
# define VERS_FROM32 (VERS_HTML32|VERS_HTML40|HT50)
# define VERS_FROM40 (VERS_HTML40|VERS_XHTML11|VERS_BASIC|VERS_HTML5)
2011-11-17 02:44:16 +00:00
/* every XHTML bit: the X10x flavors, XH11, XB10 and XH50 */
# define VERS_XHTML (X10S|X10T|X10F|XH11|XB10|XH50)
2016-02-12 12:46:49 +00:00
/* strict */
# define VERS_STRICT (VERS_HTML5|VERS_HTML40_STRICT)
2011-11-17 02:44:16 +00:00
/*
 * all W3C defined document types
 * (the trailing XH50|HT50 is already contained in VERS_FROM40 via
 * VERS_HTML5, so it is redundant but harmless)
 */
# define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40|XH50|HT50)
/* all proprietary types */
# define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
2021-07-28 23:45:57 +00:00
/**
* Linked list of class names and styles
*/
2011-11-17 02:44:16 +00:00
/* The typedef also introduces the struct tag, so no separate forward
   declaration is needed before the definition. */
typedef struct _Style TagStyle;

struct _Style
{
    tmbstr    tag;         /**< presumably the element name this entry belongs to -- confirm */
    tmbstr    tag_class;   /**< class name */
    tmbstr    properties;  /**< style properties */
    TagStyle *next;        /**< next entry in the list */
};
2021-07-28 23:45:57 +00:00
/**
* Linked list of style properties
*/
2011-11-17 02:44:16 +00:00
/* The typedef also introduces the struct tag, so no separate forward
   declaration is needed before the definition. */
typedef struct _StyleProp StyleProp;

struct _StyleProp
{
    tmbstr     name;   /**< property name */
    tmbstr     value;  /**< property value */
    StyleProp *next;   /**< next property in the list */
};
2021-07-28 23:45:57 +00:00
/**
* Attribute / Value linked list node
*/
2011-11-17 02:44:16 +00:00
struct _AttVal
{
    AttVal          *next;       /**< next attribute in the list */
    const Attribute *dict;       /**< presumably the attribute's dictionary definition -- confirm */
    Node            *asp;        /**< NOTE(review): presumably ASP markup found in the attribute -- confirm */
    Node            *php;        /**< NOTE(review): presumably PHP markup found in the attribute -- confirm */
    int              delim;      /**< NOTE(review): presumably the quote delimiter character -- confirm */
    tmbstr           attribute;  /**< attribute name (NULL-terminated string) */
    tmbstr           value;      /**< attribute value (NULL-terminated string, entities replaced) */
};
2021-07-28 23:45:57 +00:00
/**
* Mosaic handles inlines via a separate stack from other elements
* We duplicate this to recover from inline markup errors such as :
* ~ ~ ~
* < i > italic text
* < p > more italic text < / b > normal text
* ~ ~ ~
* which for compatibility with Mosaic is mapped to :
* ~ ~ ~
* < i > italic text < / i >
* < p > < i > more italic text < / i > normal text
* ~ ~ ~
* Note that any inline end tag pops the effect of the current
* inline start tag, so that `</b>` pops `<i>` in the above example.
2011-11-17 02:44:16 +00:00
*/
/* One entry of the inline stack: a saved copy of an inline element. */
struct _IStack
{
IStack * next ; /**< NOTE(review): presumably links to another stack entry -- confirm */
2021-07-28 23:45:57 +00:00
const Dict * tag ; /**< tag's dictionary definition */
tmbstr element ; /**< name (NULL for text nodes) */
2011-11-17 02:44:16 +00:00
AttVal * attributes ; /**< attribute list (linked list of AttVal nodes) */
} ;
2021-07-28 23:45:57 +00:00
/**
* HTML / XHTML / XML Element , Comment , PI , DOCTYPE , XML Decl , etc . , etc .
*/
2011-11-17 02:44:16 +00:00
/* Parse-tree node; `start`/`end` index into the lexer's character buffer. */
struct _Node
{
2021-07-28 23:45:57 +00:00
Node * parent ; /**< tree structure */
2011-11-17 02:44:16 +00:00
Node * prev ; /**< previous peer node */
Node * next ; /**< next peer node */
Node * content ; /**< children of this node (presumably the first child -- confirm) */
Node * last ; /**< NOTE(review): presumably the last child -- confirm */
AttVal * attributes ; /**< linked list of attribute/value nodes */
2021-07-28 23:45:57 +00:00
const Dict * was ; /**< old tag when it was changed */
const Dict * tag ; /**< tag's dictionary definition */
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
tmbstr element ; /**< name (NULL for text nodes) */
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
uint start ; /**< start of span onto text array */
uint end ; /**< end of span onto text array */
NodeType type ; /**< TextNode, StartTag, EndTag etc. */
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
uint line ; /**< current line of document */
uint column ; /**< current column of document */
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
Bool closed ; /**< true if closed by explicit end tag */
Bool implicit ; /**< true if inferred */
Bool linebreak ; /**< true if followed by a line break */
2011-11-17 02:44:16 +00:00
} ;
2021-07-28 23:45:57 +00:00
/**
* The following are private to the lexer .
* Use ` NewLexer ( ) ` to create a lexer , and ` FreeLexer ( ) ` to free it .
*/
2011-11-17 02:44:16 +00:00
/* Lexer state: position tracking, version bits, the shared text buffer and the inline stack. */
struct _Lexer
{
2021-07-28 23:45:57 +00:00
uint lines ; /**< lines seen */
uint columns ; /**< at start of current token */
Bool waswhite ; /**< used to collapse contiguous white space */
Bool pushed ; /**< true after token has been pushed back */
Bool insertspace ; /**< when space is moved after end tag */
Bool excludeBlocks ; /**< Netscape compatibility */
Bool exiled ; /**< true if moved out of table */
Bool isvoyager ; /**< true if xmlns attribute on html element (i.e., "Voyager" was the W3C codename for XHTML). */
uint versions ; /**< bit vector of HTML versions */
uint doctype ; /**< version as given by doctype (if any) */
uint versionEmitted ; /**< version of doctype emitted */
Bool bad_doctype ; /**< e.g. if html or PUBLIC is missing */
uint txtstart ; /**< start of current node */
uint txtend ; /**< end of current node */
LexerState state ; /**< state of lexer's finite state machine */
Node * token ; /**< last token returned by GetToken() */
Node * itoken ; /**< last duplicate inline returned by GetToken() */
Node * root ; /**< remember root node of the document */
Node * parent ; /**< remember parent node for CDATA elements */
Bool seenEndBody ; /**< true if a `</body>` tag has been encountered */
Bool seenEndHtml ; /**< true if a `</html>` tag has been encountered */
2011-11-17 02:44:16 +00:00
/*
Lexer character buffer
Parse tree nodes span onto this buffer
which contains the concatenated text
contents of all of the elements .
lexsize must be reset for each file .
*/
2021-07-28 23:45:57 +00:00
tmbstr lexbuf ; /**< MB character buffer */
uint lexlength ; /**< allocated */
uint lexsize ; /**< used */
2011-11-17 02:44:16 +00:00
/* Inline stack for compatibility with Mosaic */
2021-07-28 23:45:57 +00:00
Node * inode ; /**< for deferring text node */
IStack * insert ; /**< for inferring inline tags */
2011-11-17 02:44:16 +00:00
IStack * istack ; /**< the inline stack itself; istacklength / istacksize / istackbase below describe it */
2021-07-28 23:45:57 +00:00
uint istacklength ; /**< allocated */
uint istacksize ; /**< used */
uint istackbase ; /**< start of frame */
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
TagStyle * styles ; /**< used for cleaning up presentation markup */
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
TidyAllocator * allocator ; /**< allocator */
2011-11-17 02:44:16 +00:00
} ;
2021-07-28 23:45:57 +00:00
/**
* modes for GetToken ( )
*
* MixedContent - - for elements which don ' t accept PCDATA
* Preformatted - - white space preserved as is
* IgnoreMarkup - - for CDATA elements such as script , style
*/
/* Mode argument of GetToken(). Values are consecutive, starting at 0. */
typedef enum
{
    IgnoreWhitespace = 0,
    MixedContent,      /**< for elements which don't accept PCDATA */
    Preformatted,      /**< white space preserved as is */
    IgnoreMarkup,      /**< for CDATA elements such as script, style */
    OtherNamespace,
    CdataContent
} GetTokenMode;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/** @name Lexer Functions
* @ {
*/
/**
* Choose what version to use for new doctype
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE int TY_ ( HTMLVersion ) ( TidyDocImpl * doc ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Everything is allowed in proprietary version of HTML .
* This is handled here rather than in the tag / attr dicts
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE void TY_ ( ConstrainVersion ) ( TidyDocImpl * doc , uint vers ) ;
2011-11-17 02:44:16 +00:00
2021-07-10 15:13:58 +00:00
/*
 * Character classification predicates; each takes a character code `c`
 * and returns a Bool.
 * NOTE(review): presumably these test `c` against the lexer character
 * type bits (digit, letter, namechar, white, newline, ...) -- confirm
 * against the implementation.
 */
TY_PRIVATE Bool TY_ ( IsWhite ) ( uint c ) ;
TY_PRIVATE Bool TY_ ( IsDigit ) ( uint c ) ;
TY_PRIVATE Bool TY_ ( IsLetter ) ( uint c ) ;
TY_PRIVATE Bool TY_ ( IsHTMLSpace ) ( uint c ) ;
TY_PRIVATE Bool TY_ ( IsNewline ) ( uint c ) ;
TY_PRIVATE Bool TY_ ( IsNamechar ) ( uint c ) ;
TY_PRIVATE Bool TY_ ( IsXMLLetter ) ( uint c ) ;
TY_PRIVATE Bool TY_ ( IsXMLNamechar ) ( uint c ) ;
2011-11-17 02:44:16 +00:00
2021-07-10 15:13:58 +00:00
/*
 * Case test and case conversion for a character code `c`.
 * NOTE(review): handling of characters without a case mapping is not
 * visible from this header -- confirm.
 */
TY_PRIVATE Bool TY_ ( IsUpper ) ( uint c ) ;
TY_PRIVATE uint TY_ ( ToLower ) ( uint c ) ;
TY_PRIVATE uint TY_ ( ToUpper ) ( uint c ) ;
2011-11-17 02:44:16 +00:00
2021-07-10 15:13:58 +00:00
/* Lexer lifetime: NewLexer() creates a lexer and FreeLexer() frees it. */
TY_PRIVATE Lexer * TY_ ( NewLexer ) ( TidyDocImpl * doc ) ;
TY_PRIVATE void TY_ ( FreeLexer ) ( TidyDocImpl * doc ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Store character c as UTF - 8 encoded byte stream
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE void TY_ ( AddCharToLexer ) ( Lexer * lexer , uint c ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Used for elements and text nodes .
* - Element name is NULL for text nodes .
* - start and end are offsets into lexbuf ,
* which contains the textual content of
* all elements in the parse tree .
* - parent and content allow traversal
* of the parse tree in any direction .
* - attributes are represented as a linked
* list of AttVal nodes which hold the
* strings for attribute / value pairs .
2011-11-17 02:44:16 +00:00
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE Node * TY_ ( NewNode ) ( TidyAllocator * allocator , Lexer * lexer ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Used to clone heading nodes when split by an ` < HR > `
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE Node * TY_ ( CloneNode ) ( TidyDocImpl * doc , Node * element ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Free node ' s attributes
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE void TY_ ( FreeAttrs ) ( TidyDocImpl * doc , Node * node ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Doesn ' t repair attribute list linkage
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE void TY_ ( FreeAttribute ) ( TidyDocImpl * doc , AttVal * av ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Detach attribute from node
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE void TY_ ( DetachAttribute ) ( Node * node , AttVal * attr ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Detach attribute from node then free it .
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE void TY_ ( RemoveAttribute ) ( TidyDocImpl * doc , Node * node , AttVal * attr ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Free document nodes by iterating through peers and recursing
* through children . Set ` next ` to ` NULL ` before calling ` FreeNode ( ) `
* to avoid freeing peer nodes . Doesn ' t patch up prev / next links .
2011-11-17 02:44:16 +00:00
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE void TY_ ( FreeNode ) ( TidyDocImpl * doc , Node * node ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
2021-07-10 15:13:58 +00:00
/* NOTE(review): undocumented; presumably creates a text node for the lexer's current text span (txtstart..txtend) -- confirm. */
TY_PRIVATE Node * TY_ ( TextToken ) ( Lexer * lexer ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Used for creating preformatted text from Word2000 .
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE Node * TY_ ( NewLineNode ) ( Lexer * lexer ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Used for adding a & nbsp ; for Word2000 .
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE Node * TY_ ( NewLiteralTextNode ) ( Lexer * lexer , ctmbstr txt ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/* NOTE(review): undocumented; presumably appends `str` to the lexer's character buffer -- confirm. */
TY_PRIVATE void TY_ ( AddStringLiteral ) ( Lexer * lexer , ctmbstr str ) ;
2021-07-10 15:13:58 +00:00
/*
 * Document node lookups.
 * NOTE(review): each presumably returns the named node of `doc`, or NULL
 * when the document has none -- confirm the not-found behavior.
 */
TY_PRIVATE Node * TY_ ( FindDocType ) ( TidyDocImpl * doc ) ;
TY_PRIVATE Node * TY_ ( FindHTML ) ( TidyDocImpl * doc ) ;
TY_PRIVATE Node * TY_ ( FindHEAD ) ( TidyDocImpl * doc ) ;
TY_PRIVATE Node * TY_ ( FindTITLE ) ( TidyDocImpl * doc ) ;
TY_PRIVATE Node * TY_ ( FindBody ) ( TidyDocImpl * doc ) ;
TY_PRIVATE Node * TY_ ( FindXmlDecl ) ( TidyDocImpl * doc ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Returns containing block element , if any
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE Node * TY_ ( FindContainer ) ( Node * node ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Add meta element for Tidy .
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE Bool TY_ ( AddGenerator ) ( TidyDocImpl * doc ) ;
2011-11-17 02:44:16 +00:00
2021-07-10 15:13:58 +00:00
/* NOTE(review): undocumented; the result is presumably one of the version bits defined above (HT20 ... XH50) -- confirm. */
TY_PRIVATE uint TY_ ( ApparentVersion ) ( TidyDocImpl * doc ) ;
2011-11-17 02:44:16 +00:00
2021-07-10 15:13:58 +00:00
/* NOTE(review): undocumented; `vers` is presumably a version bit as defined above -- confirm, including the result for unknown codes. */
TY_PRIVATE ctmbstr TY_ ( HTMLVersionNameFromCode ) ( uint vers , Bool isXhtml ) ;
2011-11-17 02:44:16 +00:00
2021-07-10 15:13:58 +00:00
/* NOTE(review): undocumented; `vers` is presumably a version bit as defined above; the numeric encoding of the result is not visible here. */
TY_PRIVATE uint TY_ ( HTMLVersionNumberFromCode ) ( uint vers ) ;
2017-03-19 19:41:51 +00:00
2021-07-10 15:13:58 +00:00
/* NOTE(review): undocumented; "SI" presumably means the doctype's system identifier -- confirm. */
TY_PRIVATE Bool TY_ ( WarnMissingSIInEmittedDocType ) ( TidyDocImpl * doc ) ;
2011-11-17 02:44:16 +00:00
2021-07-10 15:13:58 +00:00
/* NOTE(review): undocumented; the meaning of the Bool result is not visible from this header. */
TY_PRIVATE Bool TY_ ( SetXHTMLDocType ) ( TidyDocImpl * doc ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Fixup doctype if missing .
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE Bool TY_ ( FixDocType ) ( TidyDocImpl * doc ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Ensure XML document starts with < ? xml version = " 1.0 " ? > , and
* add encoding attribute if not using ASCII or UTF - 8 output .
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE Bool TY_ ( FixXmlDecl ) ( TidyDocImpl * doc ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
2021-07-10 15:13:58 +00:00
/* NOTE(review): undocumented; presumably creates a node for tag `id` that is marked as inferred (see Node.implicit) -- confirm. */
TY_PRIVATE Node * TY_ ( InferredTag ) ( TidyDocImpl * doc , TidyTagId id ) ;
2011-11-17 02:44:16 +00:00
2021-07-10 15:13:58 +00:00
/* Provides one level of undo for GetToken(). */
TY_PRIVATE void TY_ ( UngetToken ) ( TidyDocImpl * doc ) ;
2011-11-17 02:44:16 +00:00
2021-07-10 15:13:58 +00:00
/* Gets the next token from the input; `mode` is one of the GetTokenMode values. */
TY_PRIVATE Node * TY_ ( GetToken ) ( TidyDocImpl * doc , GetTokenMode mode ) ;
2011-11-17 02:44:16 +00:00
2021-07-10 15:13:58 +00:00
/* NOTE(review): undocumented; presumably initializes the character classification map behind the Is* predicates -- confirm. */
TY_PRIVATE void TY_ ( InitMap ) ( void ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Create a new attribute .
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE AttVal * TY_ ( NewAttribute ) ( TidyDocImpl * doc ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Create a new attribute with given name and value .
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE AttVal * TY_ ( NewAttributeEx ) ( TidyDocImpl * doc , ctmbstr name , ctmbstr value ,
2011-11-17 02:44:16 +00:00
int delim ) ;
2021-07-28 23:45:57 +00:00
/**
* Insert attribute at the end of attribute list of a node .
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE void TY_ ( InsertAttributeAtEnd ) ( Node * node , AttVal * av ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Insert attribute at the start of attribute list of a node .
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE void TY_ ( InsertAttributeAtStart ) ( Node * node , AttVal * av ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/** @}
* @ name Inline Stack Functions
* @ {
*/
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Duplicate attributes .
*/
TY_PRIVATE AttVal * TY_ ( DupAttrs ) ( TidyDocImpl * doc , AttVal * attrs ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Push a copy of an inline node onto stack , but don ' t push if
* implicit or OBJECT or APPLET ( implicit tags are ones generated
* from the istack ) .
*
* One issue arises with pushing inlines when the tag is already pushed .
* For instance :
* ~ ~ ~
* < p > < em > text
* < p > < em > more text
* ~ ~ ~
* Shouldn ' t be mapped to
* ~ ~ ~
* < p > < em > text < / em > < / p >
* < p > < em > < em > more text < / em > < / em >
* ~ ~ ~
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE void TY_ ( PushInline ) ( TidyDocImpl * doc , Node * node ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Pop inline stack .
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE void TY_ ( PopInline ) ( TidyDocImpl * doc , Node * node ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
2021-07-10 15:13:58 +00:00
/*
 * Inline stack membership tests.
 * NOTE(review): undocumented; IsPushed presumably reports whether `node`'s
 * tag is on the inline stack, and IsPushedLast whether it is the most
 * recently pushed entry -- confirm, including the role of `element`.
 */
TY_PRIVATE Bool TY_ ( IsPushed ) ( TidyDocImpl * doc , Node * node ) ;
TY_PRIVATE Bool TY_ ( IsPushedLast ) ( TidyDocImpl * doc , Node * element , Node * node ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* This has the effect of inserting " missing " inline elements around the
* contents of blocklevel elements such as P , TD , TH , DIV , PRE etc . This
* procedure is called at the start of ` ParseBlock ` , when the inline
* stack is not empty , as will be the case in :
* ~ ~ ~
* < i > < h1 > italic heading < / h1 > < / i >
* ~ ~ ~
* which is then treated as equivalent to
* ~ ~ ~
* < h1 > < i > italic heading < / i > < / h1 >
* ~ ~ ~
* This is implemented by setting the lexer into a mode where it gets
* tokens from the inline stack rather than from the input stream .
*/
TY_PRIVATE int TY_ ( InlineDup ) ( TidyDocImpl * doc , Node * node ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Defer duplicates when entering a table or other
* element where the inlines shouldn ' t be duplicated .
*/
TY_PRIVATE void TY_ ( DeferDup ) ( TidyDocImpl * doc ) ;
2011-11-17 02:44:16 +00:00
2021-07-10 15:13:58 +00:00
/* NOTE(review): undocumented; presumably returns the next token taken from the inline stack while the lexer is in its "insert" mode (see Lexer.insert) -- confirm. */
TY_PRIVATE Node * TY_ ( InsertedToken ) ( TidyDocImpl * doc ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/**
* Stack manipulation for inline elements.
* NOTE(review): the meaning of the Bool result and of the
* `element` / `node` arguments is not visible from this header.
*/
2021-07-10 15:13:58 +00:00
TY_PRIVATE Bool TY_ ( SwitchInline ) ( TidyDocImpl * doc , Node * element , Node * node ) ;
2021-07-28 23:45:57 +00:00
2021-07-10 15:13:58 +00:00
/* NOTE(review): undocumented; presumably a variant of InlineDup() limited by `element` -- confirm. Note the argument order (node, element) differs from SwitchInline. */
TY_PRIVATE Bool TY_ ( InlineDup1 ) ( TidyDocImpl * doc , Node * node , Node * element ) ;
2011-11-17 02:44:16 +00:00
2021-07-28 23:45:57 +00:00
/** @}
* @ name Generic stack of nodes .
* @ {
*/
/**
* This typedef represents a stack of addresses to nodes . Tidy uses these to
* try to limit recursion by pushing nodes to a stack when possible instead
* of recursing .
*/
typedef struct _Stack
{
    int             top;        /**< Current top position; -1 when the stack is empty. */
    unsigned        capacity;   /**< Current capacity. Can be expanded. */
    Node          **firstNode;  /**< A pointer to the first pointer to a Node in an array of node addresses. */
    TidyAllocator  *allocator;  /**< Tidy's allocator, used at instantiation and expanding. */
} Stack;
/**
* Create a new stack with a given starting capacity. If memory allocation
* fails, then the allocator will panic the program automatically.
* NOTE(review): the returned stack is presumably released with
* freeStack() -- confirm ownership.
*/
TY_PRIVATE Stack * TY_ ( newStack ) ( TidyDocImpl * doc , uint capacity ) ;
/**
* Increase the stack size. This will be called automatically when the
* current stack is full. If memory allocation fails, then the allocator
* will panic the program automatically.
*/
TY_PRIVATE void TY_ ( growStack ) ( Stack * stack ) ;
/**
* Stack is full when top is equal to the last index.
*/
TY_PRIVATE Bool TY_ ( stackFull ) ( Stack * stack ) ;
/**
* Stack is empty when top is equal to -1.
*/
TY_PRIVATE Bool TY_ ( stackEmpty ) ( Stack * stack ) ;
/**
* Push an item to the stack.
*/
TY_PRIVATE void TY_ ( push ) ( Stack * stack , Node * node ) ;
/**
* Pop an item from the stack.
* NOTE(review): the result for an empty stack is not specified in this
* header; check stackEmpty() first or confirm in the implementation.
*/
TY_PRIVATE Node * TY_ ( pop ) ( Stack * stack ) ;
/**
* Peek at the stack.
* NOTE(review): presumably returns the top item without removing it; the
* result for an empty stack is not specified in this header -- confirm.
*/
TY_PRIVATE Node * TY_ ( peek ) ( Stack * stack ) ;
/**
* Frees the stack when done.
*/
TY_PRIVATE void TY_ ( freeStack ) ( Stack * stack ) ;
/** @}
*/
2011-11-17 02:44:16 +00:00
# ifdef __cplusplus
}
# endif
2021-07-28 23:45:57 +00:00
/** @} end lexer_h group */
/** @} end internal_api group */
2011-11-17 02:44:16 +00:00
# endif /* __LEXER_H__ */