tidy-html5/src/tags.h
Jim Derry 91f29ea7b8 HTML Tidy now parses HTML non-recursively.
Instead of recursive calls for each nested level of HTML, the next level is
pushed to a stack on the heap, and returned to the main loop. This prevents
stack overflow at _n_ depth (where _n_ is operating-system dependent). It's
probably still possible to use all of the heap memory, but Tidy's allocators
already fail gracefully in this circumstance.

Please report any regressions of your own HTML!

NOTE: the XML parser is not affected, and is probably still highly recursive.
2021-08-14 20:42:43 -04:00

479 lines
18 KiB
C

#ifndef __TAGS_H__
#define __TAGS_H__
/**************************************************************************//**
* @file
* Recognize HTML Tags.
*
* The HTML tags are stored as 8 bit ASCII strings.
* Use lookupw() to find a tag given a wide char string.
*
* @author HTACG, et al (consult git log)
*
* @copyright
* Copyright (c) 1998-2017 World Wide Web Consortium (Massachusetts
* Institute of Technology, European Research Consortium for Informatics
* and Mathematics, Keio University) and HTACG.
* @par
* All Rights Reserved.
* @par
* See `tidy.h` for the complete license.
*
* @date Additional updates: consult git log
*
******************************************************************************/
#include "forward.h"
#include "attrdict.h"
/** @addtogroup internal_api */
/** @{ */
/***************************************************************************//**
** @defgroup tags_h HTML Tags
**
** This module organizes all of Tidy's HTML tag operations, such as parsing
** tags, defining tags, and user-defined tags.
**
** @{
******************************************************************************/
/** @name Basic Structures and Tag Operations.
** These structures form the backbone of Tidy tag processing, and the
** functions in this group provide basic operations with tags and nodes.
*/
/** @{ */
/** Kinds of user-defined tags that can be created. Every value other than
** the zero-valued null marker occupies a distinct bit.
*/
typedef enum
{
    tagtype_null   = 0,      /**< Null marker; the first item. */
    tagtype_empty  = 1 << 0, /**< The tag is an empty element. */
    tagtype_inline = 1 << 1, /**< The tag is an inline element. */
    tagtype_block  = 1 << 2, /**< The tag is a block level element. */
    tagtype_pre    = 1 << 3  /**< The tag is a preformatted tag. */
} UserTagType;
/** This typedef describes a function to be used to parse HTML of a Tidy tag.
** @param doc The Tidy document.
** @param node The node being parsed.
** @param mode The GetTokenMode to be used for parsing the node contents.
** @returns A Node pointer. NOTE(review): presumably the node the caller
** should continue parsing with, or NULL when there is none — confirm
** against the parser implementations.
*/
typedef Node* (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode );
/** This typedef describes a function to be used to check the attributes
** of a Tidy tag.
** @param doc The Tidy document.
** @param node The node whose attributes are to be checked.
*/
typedef void (CheckAttribs)( TidyDocImpl* doc, Node *node );
/** Defines a dictionary entry for a single Tidy tag, including all of the
** relevant information that it requires.
** The members refer to this structure as `Dict`; that typedef is not
** declared in this header (presumably it comes from forward.h — confirm).
*/
struct _Dict
{
TidyTagId id; /**< Identifier for this tag. */
tmbstr name; /**< The tag name. */
uint versions; /**< Accumulates potential HTML versions. See TY_(ConstrainVersion). */
AttrVersion const * attrvers; /**< Accumulates potential HTML versions for attributes. */
uint model; /**< Indicates the relevant content models for the tag. See lexer.h; there is no enum. */
Parser* parser; /**< Specifies the parser to use for this tag. */
CheckAttribs* chkattrs; /**< Specifies the function to check this tag's attributes. */
Dict* next; /**< Link to next tag. NOTE(review): which list this chains (e.g. the declared tag list) is not visible here — confirm. */
};
/** This enum fixes the size of the hash table used for tag hash lookup.
*/
enum
{
    /** Number of slots in the `hashtab` array of TidyTagImpl. */
    ELEMENT_HASH_SIZE = 178u
};
/** This structure provides hash lookup for Tidy tags. Entries can be linked
** together through `next`.
*/
typedef struct _DictHash
{
Dict const* tag; /**< The tag definition this entry refers to. */
struct _DictHash* next; /**< The next hash entry linked from this one. */
} DictHash;
/** This structure consists of the lists of all tags known to Tidy.
*/
typedef struct _TidyTagImpl
{
Dict* xml_tags; /**< Placeholder for all xml tags. */
Dict* declared_tag_list; /**< User-declared tags. */
DictHash* hashtab[ELEMENT_HASH_SIZE]; /**< All of Tidy's built-in tags; each of the ELEMENT_HASH_SIZE slots points to a DictHash entry. */
} TidyTagImpl;
/** Coordinates Config update and Tags data.
** @param doc The Tidy document.
** @param opt The option the tag is intended for.
** @param name The name of the new tag.
*/
TY_PRIVATE void TY_(DeclareUserTag)( TidyDocImpl* doc, const TidyOptionImpl* opt, ctmbstr name );
/** Interface for finding a tag by TidyTagId.
** @param tid The TidyTagId to search for.
** @returns A pointer to the dictionary entry of a Tidy tag.
*/
TY_PRIVATE const Dict* TY_(LookupTagDef)( TidyTagId tid );
/** Assigns the node's tag.
** @param doc The Tidy document.
** @param node The node to assign the tag to.
** @returns A Bool indicating whether or not the tag was assigned.
*/
TY_PRIVATE Bool TY_(FindTag)( TidyDocImpl* doc, Node *node );
/** Finds the parser function for a given node.
** @param doc The Tidy document.
** @param node The node to lookup.
** @returns The parser for the given node.
*/
TY_PRIVATE Parser* TY_(FindParser)( TidyDocImpl* doc, Node *node );
/** Defines a new user-defined tag.
** @param doc The Tidy document.
** @param tagType The type of user-defined tag to define.
** @param name The name of the new tag.
*/
TY_PRIVATE void TY_(DefineTag)( TidyDocImpl* doc, UserTagType tagType, ctmbstr name );
/** Frees user-defined tags of the given type, or all user tags if given
** `tagtype_null`.
** @param doc The Tidy document.
** @param tagType The type of tag to free, or `tagtype_null` to free all
** user-defined tags.
*/
TY_PRIVATE void TY_(FreeDeclaredTags)( TidyDocImpl* doc, UserTagType tagType );
/** Initiates an iterator for a list of user-declared tags, including autonomous
** custom tags detected in the document if @ref TidyUseCustomTags is not set to
** **no**.
** @param doc An instance of a TidyDocImpl to query.
** @result Returns a TidyIterator, which is a token used to represent the
** current position in a list within LibTidy.
*/
TY_PRIVATE TidyIterator TY_(GetDeclaredTagList)( TidyDocImpl* doc );
/** Given a valid TidyIterator initiated with TY_(GetDeclaredTagList)(),
** returns a string representing a user-declared or autonomous custom tag.
** @remark Specifying tagType limits the scope of the tags to one of
** @ref UserTagType types. Note that autonomous custom tags (if used)
** are added to one of these option types, depending on the value of
** @ref TidyUseCustomTags.
** @param doc The Tidy document.
** @param tagType The type of tag to iterate through.
** @param iter The iterator token provided initially by
** TY_(GetDeclaredTagList)(). It is passed by pointer.
** @result A string containing the next tag.
*/
TY_PRIVATE ctmbstr TY_(GetNextDeclaredTag)( TidyDocImpl* doc, UserTagType tagType,
TidyIterator* iter );
/** Initializes tags and tag structures for the given Tidy document.
** @param doc The Tidy document.
*/
TY_PRIVATE void TY_(InitTags)( TidyDocImpl* doc );
/** Frees the tags and structures used by Tidy for tags.
** @param doc The Tidy document.
*/
TY_PRIVATE void TY_(FreeTags)( TidyDocImpl* doc );
/** Tidy defaults to HTML5 mode. If the <!DOCTYPE ...> is found to NOT be
** HTML5, then adjust the tags table to HTML4 mode.
** @param doc The Tidy document.
*/
TY_PRIVATE void TY_(AdjustTags)( TidyDocImpl *doc );
/** Reset the tags table back to default HTML5 mode.
** @param doc The Tidy document.
*/
TY_PRIVATE void TY_(ResetTags)( TidyDocImpl *doc );
/** Indicates whether or not Tidy is processing in HTML5 mode.
** @param doc The Tidy document.
** @returns Returns `yes` if processing in HTML5 mode.
*/
TY_PRIVATE Bool TY_(IsHTML5Mode)( TidyDocImpl *doc );
/** @} */
/** @name Parser Methods And Attribute Checker Functions for Tags
** These functions define the parsers and attribute checking functions for
** each of Tidy's tags.
*/
/** @{ */
/* Each declaration below uses the `Parser` function typedef, so it declares
** a function taking ( TidyDocImpl* doc, Node *node, GetTokenMode mode ) and
** returning Node*. */
TY_PRIVATE Parser TY_(ParseHTML);
TY_PRIVATE Parser TY_(ParseHead);
TY_PRIVATE Parser TY_(ParseTitle);
TY_PRIVATE Parser TY_(ParseScript);
TY_PRIVATE Parser TY_(ParseFrameSet);
TY_PRIVATE Parser TY_(ParseNoFrames);
TY_PRIVATE Parser TY_(ParseBody);
TY_PRIVATE Parser TY_(ParsePre);
TY_PRIVATE Parser TY_(ParseList);
TY_PRIVATE Parser TY_(ParseDefList);
TY_PRIVATE Parser TY_(ParseBlock);
TY_PRIVATE Parser TY_(ParseInline);
TY_PRIVATE Parser TY_(ParseEmpty);
TY_PRIVATE Parser TY_(ParseTableTag);
TY_PRIVATE Parser TY_(ParseColGroup);
TY_PRIVATE Parser TY_(ParseRowGroup);
TY_PRIVATE Parser TY_(ParseRow);
TY_PRIVATE Parser TY_(ParseSelect);
TY_PRIVATE Parser TY_(ParseOptGroup);
TY_PRIVATE Parser TY_(ParseText);
TY_PRIVATE Parser TY_(ParseDatalist);
TY_PRIVATE Parser TY_(ParseNamespace);
/* Declared through the `CheckAttribs` function typedef:
** void ( TidyDocImpl* doc, Node *node ). */
TY_PRIVATE CheckAttribs TY_(CheckAttributes);
/** @} */
/** @name Other Tag and Node Lookup Functions
** These functions perform additional lookup on tags and nodes.
*/
/** @{ */
/** Yields the TidyTagId of the given node, or TidyTag_UNKNOWN (0) when the
** node is NULL or has no tag assigned. The `node` argument may be evaluated
** more than once, so it should be free of side effects.
*/
#define TagId(node) (!(node) || !(node)->tag ? TidyTag_UNKNOWN : (node)->tag->id)
/** Determines if the given node is of the given tag id type.
** Evaluates to non-zero only when `node` is non-NULL, has a tag assigned,
** and that tag's id equals `tid`.
** `tid` is parenthesized in the expansion so that an expression argument
** (for example a conditional expression) is compared as a whole rather than
** being split by operator precedence. The `node` argument may be evaluated
** more than once, so it should be free of side effects.
*/
#define TagIsId(node, tid) ((node) && (node)->tag && (node)->tag->id == (tid))
/** Inquires whether or not the given node is a text node.
** @param node The node being interrogated.
** @returns Returns `yes` if the node is a text node.
*/
TY_PRIVATE Bool TY_(nodeIsText)( Node* node );
/** Inquires whether or not the given node is an element node.
** @param node The node being interrogated.
** @returns Returns `yes` if the node is an element node.
*/
TY_PRIVATE Bool TY_(nodeIsElement)( Node* node );
/** Inquires whether or not the given node has any text.
** @param doc The Tidy document.
** @param node The node being interrogated.
** @returns Returns `yes` if the node has any text.
*/
TY_PRIVATE Bool TY_(nodeHasText)( TidyDocImpl* doc, Node* node );
/** Inquires whether the given element looks like it's an autonomous custom
** element tag.
** @param element A string to be checked.
** @returns Returns `yes` if the string looks like an autonomous custom
** element tag.
*/
TY_PRIVATE Bool TY_(elementIsAutonomousCustomFormat)( ctmbstr element );
/** Inquires whether the given node looks like it's an autonomous custom
** element tag.
** @param node The node being interrogated.
** @returns Returns `yes` if the node looks like an autonomous custom
** element tag.
*/
TY_PRIVATE Bool TY_(nodeIsAutonomousCustomFormat)( Node* node );
/** True if the node looks like it's an autonomous custom element tag, and
** @ref TidyUseCustomTags is not disabled, and we're in HTML5 mode, which are
** all requirements for valid autonomous custom tags.
** @param doc The Tidy document.
** @param node The node being interrogated.
** @returns Returns `yes` if all of the above requirements are met.
*/
TY_PRIVATE Bool TY_(nodeIsAutonomousCustomTag)( TidyDocImpl* doc, Node* node );
/** Does the node have the indicated content model? True if any of the bits
** requested are set.
** @param node The node being interrogated.
** @param contentModel The content model bits to check against.
** @returns Returns `yes` if any of the requested bits are set.
*/
TY_PRIVATE Bool TY_(nodeHasCM)( Node* node, uint contentModel );
/** Does the content model of the node include block?
** @param node The node being interrogated.
** @returns Returns `yes` if the content model includes block.
*/
TY_PRIVATE Bool TY_(nodeCMIsBlock)( Node* node );
/** Does the content model of the node include inline?
** @param node The node being interrogated.
** @returns Returns `yes` if the content model includes inline.
*/
TY_PRIVATE Bool TY_(nodeCMIsInline)( Node* node );
/** Does the content model of the node include empty?
** @param node The node being interrogated.
** @returns Returns `yes` if the content model includes empty.
*/
TY_PRIVATE Bool TY_(nodeCMIsEmpty)( Node* node );
/** Is the node a header, such as H1, H2, ..., H6?
** @param node The node being interrogated.
** @returns Returns `yes` if the node is a header.
*/
TY_PRIVATE Bool TY_(nodeIsHeader)( Node* node );
/** Inquires as to the header level of the given node: 1, 2, ..., 6.
** @param node The node being interrogated.
** @returns The header level. NOTE(review): the value returned for a node
** that is not a header is not visible from this header — confirm.
*/
TY_PRIVATE uint TY_(nodeHeaderLevel)( Node* node );
/* Convenience predicates: each nodeIsXXX( node ) expands to
** TagIsId( node, TidyTag_XXX ), so it is false for a NULL node or for a
** node that has no tag assigned. */
#define nodeIsHTML( node ) TagIsId( node, TidyTag_HTML )
#define nodeIsHEAD( node ) TagIsId( node, TidyTag_HEAD )
#define nodeIsTITLE( node ) TagIsId( node, TidyTag_TITLE )
#define nodeIsBASE( node ) TagIsId( node, TidyTag_BASE )
#define nodeIsMETA( node ) TagIsId( node, TidyTag_META )
#define nodeIsBODY( node ) TagIsId( node, TidyTag_BODY )
#define nodeIsFRAMESET( node ) TagIsId( node, TidyTag_FRAMESET )
#define nodeIsFRAME( node ) TagIsId( node, TidyTag_FRAME )
#define nodeIsIFRAME( node ) TagIsId( node, TidyTag_IFRAME )
#define nodeIsNOFRAMES( node ) TagIsId( node, TidyTag_NOFRAMES )
#define nodeIsHR( node ) TagIsId( node, TidyTag_HR )
#define nodeIsH1( node ) TagIsId( node, TidyTag_H1 )
#define nodeIsH2( node ) TagIsId( node, TidyTag_H2 )
#define nodeIsPRE( node ) TagIsId( node, TidyTag_PRE )
#define nodeIsLISTING( node ) TagIsId( node, TidyTag_LISTING )
#define nodeIsP( node ) TagIsId( node, TidyTag_P )
#define nodeIsUL( node ) TagIsId( node, TidyTag_UL )
#define nodeIsOL( node ) TagIsId( node, TidyTag_OL )
#define nodeIsDL( node ) TagIsId( node, TidyTag_DL )
#define nodeIsDIR( node ) TagIsId( node, TidyTag_DIR )
#define nodeIsLI( node ) TagIsId( node, TidyTag_LI )
#define nodeIsDT( node ) TagIsId( node, TidyTag_DT )
#define nodeIsDD( node ) TagIsId( node, TidyTag_DD )
#define nodeIsTABLE( node ) TagIsId( node, TidyTag_TABLE )
#define nodeIsCAPTION( node ) TagIsId( node, TidyTag_CAPTION )
#define nodeIsTD( node ) TagIsId( node, TidyTag_TD )
#define nodeIsTH( node ) TagIsId( node, TidyTag_TH )
#define nodeIsTR( node ) TagIsId( node, TidyTag_TR )
#define nodeIsCOL( node ) TagIsId( node, TidyTag_COL )
#define nodeIsCOLGROUP( node ) TagIsId( node, TidyTag_COLGROUP )
#define nodeIsBR( node ) TagIsId( node, TidyTag_BR )
#define nodeIsA( node ) TagIsId( node, TidyTag_A )
#define nodeIsLINK( node ) TagIsId( node, TidyTag_LINK )
#define nodeIsB( node ) TagIsId( node, TidyTag_B )
#define nodeIsI( node ) TagIsId( node, TidyTag_I )
#define nodeIsSTRONG( node ) TagIsId( node, TidyTag_STRONG )
#define nodeIsEM( node ) TagIsId( node, TidyTag_EM )
#define nodeIsBIG( node ) TagIsId( node, TidyTag_BIG )
#define nodeIsSMALL( node ) TagIsId( node, TidyTag_SMALL )
#define nodeIsPARAM( node ) TagIsId( node, TidyTag_PARAM )
#define nodeIsOPTION( node ) TagIsId( node, TidyTag_OPTION )
#define nodeIsOPTGROUP( node ) TagIsId( node, TidyTag_OPTGROUP )
#define nodeIsIMG( node ) TagIsId( node, TidyTag_IMG )
#define nodeIsMAP( node ) TagIsId( node, TidyTag_MAP )
#define nodeIsAREA( node ) TagIsId( node, TidyTag_AREA )
#define nodeIsNOBR( node ) TagIsId( node, TidyTag_NOBR )
#define nodeIsWBR( node ) TagIsId( node, TidyTag_WBR )
#define nodeIsFONT( node ) TagIsId( node, TidyTag_FONT )
#define nodeIsLAYER( node ) TagIsId( node, TidyTag_LAYER )
#define nodeIsSPACER( node ) TagIsId( node, TidyTag_SPACER )
#define nodeIsCENTER( node ) TagIsId( node, TidyTag_CENTER )
#define nodeIsSTYLE( node ) TagIsId( node, TidyTag_STYLE )
#define nodeIsSCRIPT( node ) TagIsId( node, TidyTag_SCRIPT )
#define nodeIsNOSCRIPT( node ) TagIsId( node, TidyTag_NOSCRIPT )
#define nodeIsFORM( node ) TagIsId( node, TidyTag_FORM )
#define nodeIsTEXTAREA( node ) TagIsId( node, TidyTag_TEXTAREA )
#define nodeIsBLOCKQUOTE( node ) TagIsId( node, TidyTag_BLOCKQUOTE )
#define nodeIsAPPLET( node ) TagIsId( node, TidyTag_APPLET )
#define nodeIsOBJECT( node ) TagIsId( node, TidyTag_OBJECT )
#define nodeIsDIV( node ) TagIsId( node, TidyTag_DIV )
#define nodeIsSPAN( node ) TagIsId( node, TidyTag_SPAN )
#define nodeIsINPUT( node ) TagIsId( node, TidyTag_INPUT )
#define nodeIsQ( node ) TagIsId( node, TidyTag_Q )
#define nodeIsLABEL( node ) TagIsId( node, TidyTag_LABEL )
#define nodeIsH3( node ) TagIsId( node, TidyTag_H3 )
#define nodeIsH4( node ) TagIsId( node, TidyTag_H4 )
#define nodeIsH5( node ) TagIsId( node, TidyTag_H5 )
#define nodeIsH6( node ) TagIsId( node, TidyTag_H6 )
#define nodeIsADDRESS( node ) TagIsId( node, TidyTag_ADDRESS )
#define nodeIsXMP( node ) TagIsId( node, TidyTag_XMP )
#define nodeIsSELECT( node ) TagIsId( node, TidyTag_SELECT )
#define nodeIsBLINK( node ) TagIsId( node, TidyTag_BLINK )
#define nodeIsMARQUEE( node ) TagIsId( node, TidyTag_MARQUEE )
#define nodeIsEMBED( node ) TagIsId( node, TidyTag_EMBED )
#define nodeIsBASEFONT( node ) TagIsId( node, TidyTag_BASEFONT )
#define nodeIsISINDEX( node ) TagIsId( node, TidyTag_ISINDEX )
#define nodeIsS( node ) TagIsId( node, TidyTag_S )
#define nodeIsSTRIKE( node ) TagIsId( node, TidyTag_STRIKE )
#define nodeIsSUB( node ) TagIsId( node, TidyTag_SUB )
#define nodeIsSUP( node ) TagIsId( node, TidyTag_SUP )
#define nodeIsU( node ) TagIsId( node, TidyTag_U )
#define nodeIsMENU( node ) TagIsId( node, TidyTag_MENU )
#define nodeIsMAIN( node ) TagIsId( node, TidyTag_MAIN )
#define nodeIsBUTTON( node ) TagIsId( node, TidyTag_BUTTON )
#define nodeIsCANVAS( node ) TagIsId( node, TidyTag_CANVAS )
#define nodeIsPROGRESS( node ) TagIsId( node, TidyTag_PROGRESS )
#define nodeIsINS( node ) TagIsId( node, TidyTag_INS )
#define nodeIsDEL( node ) TagIsId( node, TidyTag_DEL )
#define nodeIsSVG( node ) TagIsId( node, TidyTag_SVG )
/* HTML5 */
#define nodeIsDATALIST( node ) TagIsId( node, TidyTag_DATALIST )
#define nodeIsDATA( node ) TagIsId( node, TidyTag_DATA )
#define nodeIsMATHML( node ) TagIsId( node, TidyTag_MATHML ) /* #130 MathML attr and entity fix! */
/* NOT in HTML 5 */
#define nodeIsACRONYM( node ) TagIsId( node, TidyTag_ACRONYM )
/* NOTE(review): `nodesIsFRAME` (with an extra "s") tests the same tag id as
** the `nodeIsFRAME` macro defined earlier in this header, so it looks like a
** misspelled duplicate. It is kept so that any existing users still compile;
** prefer `nodeIsFRAME` in new code. */
#define nodesIsFRAME( node ) TagIsId( node, TidyTag_FRAME )
#define nodeIsTT( node ) TagIsId( node, TidyTag_TT )
/** @} name */
/** @} tags_h group */
/** @} internal_api addtogroup */
#endif /* __TAGS_H__ */