474 lines
17 KiB
C
474 lines
17 KiB
C
#ifndef __TAGS_H__
|
|
#define __TAGS_H__
|
|
|
|
/**************************************************************************//**
|
|
* @file
|
|
* Recognize HTML Tags.
|
|
*
|
|
* The HTML tags are stored as 8 bit ASCII strings.
|
|
* Use lookupw() to find a tag given a wide char string.
|
|
*
|
|
* @author HTACG, et al (consult git log)
|
|
*
|
|
* @copyright
|
|
* Copyright (c) 1998-2017 World Wide Web Consortium (Massachusetts
|
|
* Institute of Technology, European Research Consortium for Informatics
|
|
* and Mathematics, Keio University) and HTACG.
|
|
* @par
|
|
* All Rights Reserved.
|
|
* @par
|
|
* See `tidy.h` for the complete license.
|
|
*
|
|
* @date Additional updates: consult git log
|
|
*
|
|
******************************************************************************/
|
|
|
|
#include "forward.h"
|
|
#include "attrdict.h"
|
|
|
|
/** @addtogroup internal_api */
|
|
/** @{ */
|
|
|
|
|
|
/***************************************************************************//**
|
|
** @defgroup tags_h HTML Tags
|
|
**
|
|
** This module organizes all of Tidy's HTML tag operations, such as parsing
|
|
** tags, defining tags, and user-defined tags.
|
|
**
|
|
** @{
|
|
******************************************************************************/
|
|
|
|
|
|
/** @name Basic Structures and Tag Operations.
|
|
** These structures form the backbone of Tidy tag processing, and the
|
|
** functions in this group provide basic operations with tags and nodes.
|
|
*/
|
|
/** @{ */
|
|
|
|
|
|
/** This enumeration defines the types of user-defined tags that can be
|
|
** created.
|
|
*/
|
|
typedef enum
|
|
{
|
|
tagtype_null = 0, /**< First item marker. */
|
|
tagtype_empty = 1, /**< Tag is an empty element. */
|
|
tagtype_inline = 2, /**< Tag is an inline element. */
|
|
tagtype_block = 4, /**< Tag is a block level element. */
|
|
tagtype_pre = 8 /**< Tag is a preformatted tag. */
|
|
} UserTagType;
|
|
|
|
|
|
/** This typedef describes a function to be used to parse HTML of a Tidy tag.
|
|
*/
|
|
typedef void (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode );
|
|
|
|
|
|
/** This typedef describes a function be be used to check the attributes
|
|
** of a Tidy tag.
|
|
*/
|
|
typedef void (CheckAttribs)( TidyDocImpl* doc, Node *node );
|
|
|
|
|
|
/** Defines a dictionary entry for a single Tidy tag, including all of the
|
|
** relevant information that it requires.
|
|
*/
|
|
struct _Dict
|
|
{
|
|
TidyTagId id; /**< Identifier for this tag. */
|
|
tmbstr name; /**< The tag name. */
|
|
uint versions; /**< Accumulates potential HTML versions. See TY_(ConstrainVersion). */
|
|
AttrVersion const * attrvers; /**< Accumulates potential HTML versions for attributes. */
|
|
uint model; /**< Indicates the relevant content models for the tag. See lexer.h; there is no enum. */
|
|
Parser* parser; /**< Specifies the parser to use for this tag. */
|
|
CheckAttribs* chkattrs; /**< Specifies the function to check this tag's attributes. */
|
|
Dict* next; /**< Link to next tag. */
|
|
};
|
|
|
|
|
|
/** This enum indicates the maximum size of the has table for tag hash lookup.
|
|
*/
|
|
enum
|
|
{
|
|
ELEMENT_HASH_SIZE=178u /**< Maximum number of tags in the hash table. */
|
|
};
|
|
|
|
|
|
/** This structure provide hash lookup for Tidy tags.
|
|
*/
|
|
typedef struct _DictHash
|
|
{
|
|
Dict const* tag; /**< The current tag. */
|
|
struct _DictHash* next; /**< The next tag. */
|
|
} DictHash;
|
|
|
|
|
|
/** This structure consists of the lists of all tags known to Tidy.
|
|
*/
|
|
typedef struct _TidyTagImpl
|
|
{
|
|
Dict* xml_tags; /**< Placeholder for all xml tags. */
|
|
Dict* declared_tag_list; /**< User-declared tags. */
|
|
DictHash* hashtab[ELEMENT_HASH_SIZE]; /**< All of Tidy's built-in tags. */
|
|
} TidyTagImpl;
|
|
|
|
|
|
/** Coordinates Config update and Tags data.
|
|
** @param doc The Tidy document.
|
|
** @param opt The option the tag is intended for.
|
|
** @param name The name of the new tag.
|
|
*/
|
|
TY_PRIVATE void TY_(DeclareUserTag)( TidyDocImpl* doc, const TidyOptionImpl* opt, ctmbstr name );
|
|
|
|
|
|
/** Interface for finding a tag by TidyTagId.
|
|
** @param tid The TidyTagId to search for.
|
|
** @returns An instance of a Tidy tag.
|
|
*/
|
|
TY_PRIVATE const Dict* TY_(LookupTagDef)( TidyTagId tid );
|
|
|
|
/** Assigns the node's tag.
|
|
** @param doc The Tidy document.
|
|
** @param node The node to assign the tag to.
|
|
** @returns Returns a bool indicating whether or not the tag was assigned.
|
|
*/
|
|
TY_PRIVATE Bool TY_(FindTag)( TidyDocImpl* doc, Node *node );
|
|
|
|
|
|
/** Finds the parser function for a given node.
|
|
** @param doc The Tidy document.
|
|
** @param node The node to lookup.
|
|
** @returns The parser for the given node.
|
|
*/
|
|
TY_PRIVATE Parser* TY_(FindParser)( TidyDocImpl* doc, Node *node );
|
|
|
|
|
|
/** Defines a new user-defined tag.
|
|
** @param doc The Tidy document.
|
|
** @param tagType The type of user-defined tag to define.
|
|
** @param name The name of the new tag.
|
|
*/
|
|
TY_PRIVATE void TY_(DefineTag)( TidyDocImpl* doc, UserTagType tagType, ctmbstr name );
|
|
|
|
|
|
/** Frees user-defined tags of the given type, or all user tags in given
|
|
** `tagtype_null`.
|
|
** @param doc The Tidy document.
|
|
** @param tagType The type of tag to free, or `tagtype_null` to free all
|
|
** user-defined tags.
|
|
*/
|
|
TY_PRIVATE void TY_(FreeDeclaredTags)( TidyDocImpl* doc, UserTagType tagType );
|
|
|
|
|
|
/** Initiates an iterator for a list of user-declared tags, including autonomous
|
|
** custom tags detected in the document if @ref TidyUseCustomTags is not set to
|
|
** **no**.
|
|
** @param doc An instance of a TidyDocImp to query.
|
|
** @result Returns a TidyIterator, which is a token used to represent the
|
|
** current position in a list within LibTidy.
|
|
*/
|
|
TY_PRIVATE TidyIterator TY_(GetDeclaredTagList)( TidyDocImpl* doc );
|
|
|
|
|
|
/** Given a valid TidyIterator initiated with TY_(GetDeclaredTagList)(),
|
|
** returns a string representing a user-declared or autonomous custom tag.
|
|
** @remark Specifying tagType limits the scope of the tags to one of
|
|
** @ref UserTagType types. Note that autonomous custom tags (if used)
|
|
** are added to one of these option types, depending on the value of
|
|
** @ref TidyUseCustomTags.
|
|
** @param doc The Tidy document.
|
|
** @param tagType The type of tag to iterate through.
|
|
** @param iter The iterator token provided initially by
|
|
** TY_(GetDeclaredTagList)().
|
|
** @result A string containing the next tag.
|
|
*/
|
|
TY_PRIVATE ctmbstr TY_(GetNextDeclaredTag)( TidyDocImpl* doc, UserTagType tagType,
|
|
TidyIterator* iter );
|
|
|
|
|
|
/** Initializes tags and tag structures for the given Tidy document.
|
|
** @param doc The Tidy document.
|
|
*/
|
|
TY_PRIVATE void TY_(InitTags)( TidyDocImpl* doc );
|
|
|
|
|
|
/** Frees the tags and structures used by Tidy for tags.
|
|
** @param doc The Tidy document.
|
|
*/
|
|
TY_PRIVATE void TY_(FreeTags)( TidyDocImpl* doc );
|
|
|
|
|
|
/** Tidy defaults to HTML5 mode. If the <!DOCTYPE ...> is found to NOT be
|
|
** HTML5, then adjust the tags table to HTML4 mode.
|
|
** @param doc The Tidy document.
|
|
*/
|
|
TY_PRIVATE void TY_(AdjustTags)( TidyDocImpl *doc );
|
|
|
|
|
|
/** Reset the tags table back to default HTML5 mode.
|
|
** @param doc The Tidy document.
|
|
*/
|
|
TY_PRIVATE void TY_(ResetTags)( TidyDocImpl *doc );
|
|
|
|
|
|
/** Indicates whether or not the Tidy is procesing in HTML5 mode.
|
|
** @param doc The Tidy document.
|
|
** @returns Returns `yes` if processing in HTML5 mode.
|
|
*/
|
|
TY_PRIVATE Bool TY_(IsHTML5Mode)( TidyDocImpl *doc );
|
|
|
|
|
|
/** @} */
|
|
/** @name Parser Methods And Attribute Checker Functions for Tags
|
|
** These functions define the parsers and attribute checking functions for
|
|
** each of Tidy's tags.
|
|
*/
|
|
/** @{ */
|
|
|
|
|
|
TY_PRIVATE Parser TY_(ParseHTML);
|
|
TY_PRIVATE Parser TY_(ParseHead);
|
|
TY_PRIVATE Parser TY_(ParseTitle);
|
|
TY_PRIVATE Parser TY_(ParseScript);
|
|
TY_PRIVATE Parser TY_(ParseFrameSet);
|
|
TY_PRIVATE Parser TY_(ParseNoFrames);
|
|
TY_PRIVATE Parser TY_(ParseBody);
|
|
TY_PRIVATE Parser TY_(ParsePre);
|
|
TY_PRIVATE Parser TY_(ParseList);
|
|
TY_PRIVATE Parser TY_(ParseDefList);
|
|
TY_PRIVATE Parser TY_(ParseBlock);
|
|
TY_PRIVATE Parser TY_(ParseInline);
|
|
TY_PRIVATE Parser TY_(ParseEmpty);
|
|
TY_PRIVATE Parser TY_(ParseTableTag);
|
|
TY_PRIVATE Parser TY_(ParseColGroup);
|
|
TY_PRIVATE Parser TY_(ParseRowGroup);
|
|
TY_PRIVATE Parser TY_(ParseRow);
|
|
TY_PRIVATE Parser TY_(ParseSelect);
|
|
TY_PRIVATE Parser TY_(ParseOptGroup);
|
|
TY_PRIVATE Parser TY_(ParseText);
|
|
TY_PRIVATE Parser TY_(ParseDatalist);
|
|
TY_PRIVATE Parser TY_(ParseNamespace);
|
|
|
|
TY_PRIVATE CheckAttribs TY_(CheckAttributes);
|
|
|
|
|
|
/** @} */
|
|
/** @name Other Tag and Node Lookup Functions
|
|
** These functions perform additional lookup on tags and nodes.
|
|
*/
|
|
/** @{ */
|
|
|
|
|
|
/** Gets the TidyTagId of the given node. 0 == TidyTag_UNKNOWN.
|
|
*/
|
|
#define TagId(node) ((node) && (node)->tag ? (node)->tag->id : TidyTag_UNKNOWN)
|
|
|
|
|
|
/** Determines if the given node is of the given tag id type.
|
|
*/
|
|
#define TagIsId(node, tid) ((node) && (node)->tag && (node)->tag->id == tid)
|
|
|
|
|
|
/** Inquires whether or not the given node is a text node.
|
|
** @param node The node being interrogated.
|
|
** @returns The status of the inquiry.
|
|
*/
|
|
TY_PRIVATE Bool TY_(nodeIsText)( Node* node );
|
|
|
|
|
|
/** Inquires whether or not the given node is an element node.
|
|
** @param node The node being interrogated.
|
|
** @returns The status of the inquiry.
|
|
*/
|
|
TY_PRIVATE Bool TY_(nodeIsElement)( Node* node );
|
|
|
|
|
|
/** Inquires whether or not the given node has any text.
|
|
** @param doc The Tidy document.
|
|
** @param node The node being interrogated.
|
|
** @returns The status of the inquiry.
|
|
*/
|
|
TY_PRIVATE Bool TY_(nodeHasText)( TidyDocImpl* doc, Node* node );
|
|
|
|
|
|
/** Inquires whether the given element looks like it's an autonomous custom
|
|
** element tag.
|
|
** @param element A string to be checked.
|
|
** @returns The status of the inquiry.
|
|
*/
|
|
TY_PRIVATE Bool TY_(elementIsAutonomousCustomFormat)( ctmbstr element );
|
|
|
|
|
|
/** Inquires whether the given node looks like it's an autonomous custom
|
|
** element tag.
|
|
** @param node The node being interrogated.
|
|
** @returns The status of the inquiry.
|
|
*/
|
|
TY_PRIVATE Bool TY_(nodeIsAutonomousCustomFormat)( Node* node );
|
|
|
|
|
|
/** True if the node looks like it's an autonomous custom element tag, and
|
|
** TidyCustomTags is not disabled, and we're in HTML5 mode, which are all
|
|
** requirements for valid autonomous custom tags.
|
|
** @param doc The Tidy document.
|
|
** @param node The node being interrogated.
|
|
** @returns The status of the inquiry.
|
|
*/
|
|
TY_PRIVATE Bool TY_(nodeIsAutonomousCustomTag)( TidyDocImpl* doc, Node* node );
|
|
|
|
|
|
/** Does the node have the indicated content model? True if any of the bits
|
|
** requested are set.
|
|
** @param node The node being interrogated.
|
|
** @param contentModel The content model to check against.
|
|
** @returns The status of the inquiry.
|
|
*/
|
|
TY_PRIVATE Bool TY_(nodeHasCM)( Node* node, uint contentModel );
|
|
|
|
|
|
/** Does the content model of the node include block?
|
|
** @param node The node being interrogated.
|
|
** @returns The status of the inquiry.
|
|
*/
|
|
TY_PRIVATE Bool TY_(nodeCMIsBlock)( Node* node );
|
|
|
|
|
|
/** Does the content model of the node include inline?
|
|
** @param node The node being interrogated.
|
|
** @returns The status of the inquiry.
|
|
*/
|
|
TY_PRIVATE Bool TY_(nodeCMIsInline)( Node* node );
|
|
|
|
|
|
/** Does the content model of the node include empty?
|
|
** @param node The node being interrogated.
|
|
** @returns The status of the inquiry.
|
|
*/
|
|
TY_PRIVATE Bool TY_(nodeCMIsEmpty)( Node* node );
|
|
|
|
|
|
/** Is the node a header, such as H1, H2, ..., H6?
|
|
** @param node The node being interrogated.
|
|
** @returns The status of the inquiry.
|
|
*/
|
|
TY_PRIVATE Bool TY_(nodeIsHeader)( Node* node );
|
|
|
|
|
|
/** Inquires as to the header level of the given node: 1, 2, ..., 6.
|
|
** @param node The node being interrogated.
|
|
** @returns The header level.
|
|
*/
|
|
TY_PRIVATE uint TY_(nodeHeaderLevel)( Node* node );
|
|
|
|
|
|
#define nodeIsHTML( node ) TagIsId( node, TidyTag_HTML )
|
|
#define nodeIsHEAD( node ) TagIsId( node, TidyTag_HEAD )
|
|
#define nodeIsTITLE( node ) TagIsId( node, TidyTag_TITLE )
|
|
#define nodeIsBASE( node ) TagIsId( node, TidyTag_BASE )
|
|
#define nodeIsMETA( node ) TagIsId( node, TidyTag_META )
|
|
#define nodeIsBODY( node ) TagIsId( node, TidyTag_BODY )
|
|
#define nodeIsFRAMESET( node ) TagIsId( node, TidyTag_FRAMESET )
|
|
#define nodeIsFRAME( node ) TagIsId( node, TidyTag_FRAME )
|
|
#define nodeIsIFRAME( node ) TagIsId( node, TidyTag_IFRAME )
|
|
#define nodeIsNOFRAMES( node ) TagIsId( node, TidyTag_NOFRAMES )
|
|
#define nodeIsHR( node ) TagIsId( node, TidyTag_HR )
|
|
#define nodeIsH1( node ) TagIsId( node, TidyTag_H1 )
|
|
#define nodeIsH2( node ) TagIsId( node, TidyTag_H2 )
|
|
#define nodeIsPRE( node ) TagIsId( node, TidyTag_PRE )
|
|
#define nodeIsLISTING( node ) TagIsId( node, TidyTag_LISTING )
|
|
#define nodeIsP( node ) TagIsId( node, TidyTag_P )
|
|
#define nodeIsUL( node ) TagIsId( node, TidyTag_UL )
|
|
#define nodeIsOL( node ) TagIsId( node, TidyTag_OL )
|
|
#define nodeIsDL( node ) TagIsId( node, TidyTag_DL )
|
|
#define nodeIsDIR( node ) TagIsId( node, TidyTag_DIR )
|
|
#define nodeIsLI( node ) TagIsId( node, TidyTag_LI )
|
|
#define nodeIsDT( node ) TagIsId( node, TidyTag_DT )
|
|
#define nodeIsDD( node ) TagIsId( node, TidyTag_DD )
|
|
#define nodeIsTABLE( node ) TagIsId( node, TidyTag_TABLE )
|
|
#define nodeIsCAPTION( node ) TagIsId( node, TidyTag_CAPTION )
|
|
#define nodeIsTD( node ) TagIsId( node, TidyTag_TD )
|
|
#define nodeIsTH( node ) TagIsId( node, TidyTag_TH )
|
|
#define nodeIsTR( node ) TagIsId( node, TidyTag_TR )
|
|
#define nodeIsCOL( node ) TagIsId( node, TidyTag_COL )
|
|
#define nodeIsCOLGROUP( node ) TagIsId( node, TidyTag_COLGROUP )
|
|
#define nodeIsBR( node ) TagIsId( node, TidyTag_BR )
|
|
#define nodeIsA( node ) TagIsId( node, TidyTag_A )
|
|
#define nodeIsLINK( node ) TagIsId( node, TidyTag_LINK )
|
|
#define nodeIsB( node ) TagIsId( node, TidyTag_B )
|
|
#define nodeIsI( node ) TagIsId( node, TidyTag_I )
|
|
#define nodeIsSTRONG( node ) TagIsId( node, TidyTag_STRONG )
|
|
#define nodeIsEM( node ) TagIsId( node, TidyTag_EM )
|
|
#define nodeIsBIG( node ) TagIsId( node, TidyTag_BIG )
|
|
#define nodeIsSMALL( node ) TagIsId( node, TidyTag_SMALL )
|
|
#define nodeIsPARAM( node ) TagIsId( node, TidyTag_PARAM )
|
|
#define nodeIsOPTION( node ) TagIsId( node, TidyTag_OPTION )
|
|
#define nodeIsOPTGROUP( node ) TagIsId( node, TidyTag_OPTGROUP )
|
|
#define nodeIsIMG( node ) TagIsId( node, TidyTag_IMG )
|
|
#define nodeIsMAP( node ) TagIsId( node, TidyTag_MAP )
|
|
#define nodeIsAREA( node ) TagIsId( node, TidyTag_AREA )
|
|
#define nodeIsNOBR( node ) TagIsId( node, TidyTag_NOBR )
|
|
#define nodeIsWBR( node ) TagIsId( node, TidyTag_WBR )
|
|
#define nodeIsFONT( node ) TagIsId( node, TidyTag_FONT )
|
|
#define nodeIsLAYER( node ) TagIsId( node, TidyTag_LAYER )
|
|
#define nodeIsSPACER( node ) TagIsId( node, TidyTag_SPACER )
|
|
#define nodeIsCENTER( node ) TagIsId( node, TidyTag_CENTER )
|
|
#define nodeIsSTYLE( node ) TagIsId( node, TidyTag_STYLE )
|
|
#define nodeIsSCRIPT( node ) TagIsId( node, TidyTag_SCRIPT )
|
|
#define nodeIsNOSCRIPT( node ) TagIsId( node, TidyTag_NOSCRIPT )
|
|
#define nodeIsFORM( node ) TagIsId( node, TidyTag_FORM )
|
|
#define nodeIsTEXTAREA( node ) TagIsId( node, TidyTag_TEXTAREA )
|
|
#define nodeIsBLOCKQUOTE( node ) TagIsId( node, TidyTag_BLOCKQUOTE )
|
|
#define nodeIsAPPLET( node ) TagIsId( node, TidyTag_APPLET )
|
|
#define nodeIsOBJECT( node ) TagIsId( node, TidyTag_OBJECT )
|
|
#define nodeIsDIV( node ) TagIsId( node, TidyTag_DIV )
|
|
#define nodeIsSPAN( node ) TagIsId( node, TidyTag_SPAN )
|
|
#define nodeIsINPUT( node ) TagIsId( node, TidyTag_INPUT )
|
|
#define nodeIsQ( node ) TagIsId( node, TidyTag_Q )
|
|
#define nodeIsLABEL( node ) TagIsId( node, TidyTag_LABEL )
|
|
#define nodeIsH3( node ) TagIsId( node, TidyTag_H3 )
|
|
#define nodeIsH4( node ) TagIsId( node, TidyTag_H4 )
|
|
#define nodeIsH5( node ) TagIsId( node, TidyTag_H5 )
|
|
#define nodeIsH6( node ) TagIsId( node, TidyTag_H6 )
|
|
#define nodeIsADDRESS( node ) TagIsId( node, TidyTag_ADDRESS )
|
|
#define nodeIsXMP( node ) TagIsId( node, TidyTag_XMP )
|
|
#define nodeIsSELECT( node ) TagIsId( node, TidyTag_SELECT )
|
|
#define nodeIsBLINK( node ) TagIsId( node, TidyTag_BLINK )
|
|
#define nodeIsMARQUEE( node ) TagIsId( node, TidyTag_MARQUEE )
|
|
#define nodeIsEMBED( node ) TagIsId( node, TidyTag_EMBED )
|
|
#define nodeIsBASEFONT( node ) TagIsId( node, TidyTag_BASEFONT )
|
|
#define nodeIsISINDEX( node ) TagIsId( node, TidyTag_ISINDEX )
|
|
#define nodeIsS( node ) TagIsId( node, TidyTag_S )
|
|
#define nodeIsSTRIKE( node ) TagIsId( node, TidyTag_STRIKE )
|
|
#define nodeIsSUB( node ) TagIsId( node, TidyTag_SUB )
|
|
#define nodeIsSUP( node ) TagIsId( node, TidyTag_SUP )
|
|
#define nodeIsU( node ) TagIsId( node, TidyTag_U )
|
|
#define nodeIsMENU( node ) TagIsId( node, TidyTag_MENU )
|
|
#define nodeIsMAIN( node ) TagIsId( node, TidyTag_MAIN )
|
|
#define nodeIsBUTTON( node ) TagIsId( node, TidyTag_BUTTON )
|
|
#define nodeIsCANVAS( node ) TagIsId( node, TidyTag_CANVAS )
|
|
#define nodeIsPROGRESS( node ) TagIsId( node, TidyTag_PROGRESS )
|
|
|
|
#define nodeIsINS( node ) TagIsId( node, TidyTag_INS )
|
|
#define nodeIsDEL( node ) TagIsId( node, TidyTag_DEL )
|
|
|
|
#define nodeIsSVG( node ) TagIsId( node, TidyTag_SVG )
|
|
|
|
/* HTML5 */
|
|
#define nodeIsDATALIST( node ) TagIsId( node, TidyTag_DATALIST )
|
|
#define nodeIsDATA( node ) TagIsId( node, TidyTag_DATA )
|
|
#define nodeIsMATHML( node ) TagIsId( node, TidyTag_MATHML ) /* #130 MathML attr and entity fix! */
|
|
|
|
/* NOT in HTML 5 */
|
|
#define nodeIsACRONYM( node ) TagIsId( node, TidyTag_ACRONYM )
|
|
#define nodesIsFRAME( node ) TagIsId( node, TidyTag_FRAME )
|
|
#define nodeIsTT( node ) TagIsId( node, TidyTag_TT )
|
|
|
|
|
|
/** @} name */
|
|
/** @} tags_h group */
|
|
/** @} internal_api addtogroup */
|
|
|
|
|
|
#endif /* __TAGS_H__ */
|