commit
db847e6e1c
83
src/clean.c
83
src/clean.c
|
@ -1585,11 +1585,16 @@ void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
|
||||||
*/
|
*/
|
||||||
void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
|
void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
|
||||||
{
|
{
|
||||||
|
Stack *stack = TY_(newStack)(doc, 16);
|
||||||
|
Node *next;
|
||||||
|
|
||||||
tmbchar indent_buf[ 32 ];
|
tmbchar indent_buf[ 32 ];
|
||||||
uint indent;
|
uint indent;
|
||||||
|
|
||||||
while (node)
|
while (node)
|
||||||
{
|
{
|
||||||
|
next = node->next;
|
||||||
|
|
||||||
if ( nodeIsBLOCKQUOTE(node) && node->implicit )
|
if ( nodeIsBLOCKQUOTE(node) && node->implicit )
|
||||||
{
|
{
|
||||||
indent = 1;
|
indent = 1;
|
||||||
|
@ -1602,19 +1607,27 @@ void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
|
||||||
StripOnlyChild( doc, node );
|
StripOnlyChild( doc, node );
|
||||||
}
|
}
|
||||||
|
|
||||||
if (node->content)
|
|
||||||
TY_(BQ2Div)( doc, node->content );
|
|
||||||
|
|
||||||
TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
|
TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
|
||||||
2*indent);
|
2*indent);
|
||||||
|
|
||||||
RenameElem( doc, node, TidyTag_DIV );
|
RenameElem( doc, node, TidyTag_DIV );
|
||||||
TY_(AddStyleProperty)(doc, node, indent_buf );
|
TY_(AddStyleProperty)(doc, node, indent_buf );
|
||||||
|
|
||||||
|
if (node->content)
|
||||||
|
{
|
||||||
|
TY_(push)(stack, next);
|
||||||
|
node = node->content;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (node->content)
|
else if (node->content)
|
||||||
TY_(BQ2Div)( doc, node->content );
|
{
|
||||||
|
TY_(push)(stack, next);
|
||||||
|
node = node->content;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
node = node->next;
|
node = next ? next : TY_(pop)(stack);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2736,30 +2749,42 @@ void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
|
||||||
*/
|
*/
|
||||||
static void StyleToHead(TidyDocImpl* doc, Node *head, Node *node, Bool fix, int indent)
{
    /*
     * Walk the tree iteratively instead of recursing: whenever we descend
     * into a node's children, the sibling we still have to visit afterwards
     * is parked on an explicit stack and picked up again once the child
     * list is exhausted.
     *
     * doc    - document being processed (used for reporting and allocation)
     * head   - node that <style> elements are moved into when fix is set
     * node   - first node of the sibling list to scan
     * fix    - if set, move each <style> to the end of head; otherwise only
     *          report that a <style> was found
     * indent - depth counter; it is only incremented/decremented here and
     *          is not otherwise read by this function
     */
    Stack *stack = TY_(newStack)(doc, 16);
    Node *next;

    while (node)
    {
        next = node->next; /* get 'next' now, in case the node is moved */

        if (nodeIsSTYLE(node))
        {
            if (fix)
            {
                TY_(RemoveNode)(node);                             /* unhook style node from body */
                TY_(InsertNodeAtEnd)(head, node);                  /* add to end of head */
                TY_(Report)(doc, node, head, MOVED_STYLE_TO_HEAD); /* report move */
            }
            else
            {
                TY_(Report)(doc, node, head, FOUND_STYLE_IN_BODY);
            }
        }
        else if (node->content)
        {
            /* Descend; push() ignores a NULL 'next', so nothing is parked
               when this node is the last of its siblings. */
            TY_(push)(stack, next);
            node = node->content;
            indent++;
            continue;
        }

        if (next)
        {
            node = next;
        }
        else
        {
            /* End of this sibling list: resume at the most recently parked
               sibling, or stop (NULL) when the stack is empty. */
            node = TY_(pop)(stack);
            indent--;
        }
    }

    /* Release the traversal stack; previously it was leaked on every call.
       freeStack() only releases the node array, so the Stack structure
       allocated by newStack() is freed separately. */
    TY_(freeStack)(stack);
    TidyFree(doc->allocator, stack);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
122
src/lexer.c
122
src/lexer.c
|
@ -877,15 +877,6 @@ static tmbchar LastChar( tmbstr str )
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
node->type is one of these:
|
|
||||||
|
|
||||||
#define TextNode 1
|
|
||||||
#define StartTag 2
|
|
||||||
#define EndTag 3
|
|
||||||
#define StartEndTag 4
|
|
||||||
*/
|
|
||||||
|
|
||||||
Lexer* TY_(NewLexer)( TidyDocImpl* doc )
|
Lexer* TY_(NewLexer)( TidyDocImpl* doc )
|
||||||
{
|
{
|
||||||
Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) );
|
Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) );
|
||||||
|
@ -1545,13 +1536,7 @@ void TY_(FreeNode)( TidyDocImpl* doc, Node *node )
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
/* this is no good ;=((
|
|
||||||
if (node && doc && doc->lexer) {
|
|
||||||
if (node == doc->lexer->token) {
|
|
||||||
doc->lexer->token = NULL; // TY_(NewNode)( doc->lexer->allocator, doc->lexer );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
----------------- */
|
|
||||||
while ( node )
|
while ( node )
|
||||||
{
|
{
|
||||||
Node* next = node->next;
|
Node* next = node->next;
|
||||||
|
@ -4462,11 +4447,102 @@ static Node *ParseDocTypeDecl(TidyDocImpl* doc)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* local variables:
|
/****************************************************************************//*
|
||||||
* mode: c
|
** MARK: - Node Stack
|
||||||
* indent-tabs-mode: nil
|
***************************************************************************/
|
||||||
* c-basic-offset: 4
|
|
||||||
* eval: (c-set-offset 'substatement-open 0)
|
|
||||||
* end:
|
/**
|
||||||
|
* Create a new stack with a given starting capacity. If memory allocation
|
||||||
|
* fails, then the allocator will panic the program automatically.
|
||||||
*/
|
*/
|
||||||
|
Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity)
{
    Stack *stack = (Stack *)TidyAlloc(doc->allocator, sizeof(Stack));

    /* growStack() enlarges the stack by doubling its capacity, so a stack
       that starts with zero slots could never grow. Guarantee one slot. */
    if (capacity == 0)
        capacity = 1;

    stack->top = -1;               /* -1 marks the empty stack */
    stack->capacity = capacity;
    /* The array holds Node pointers, so size each slot as a Node*. */
    stack->firstNode = (Node **)TidyAlloc(doc->allocator, stack->capacity * sizeof(Node*));
    stack->allocator = doc->allocator;
    return stack;
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Increase the stack size. This will be called automatically when the
|
||||||
|
* current stack is full. If memory allocation fails, then the allocator
|
||||||
|
* will panic the program automatically.
|
||||||
|
*/
|
||||||
|
void TY_(growStack)(Stack *stack)
|
||||||
|
{
|
||||||
|
uint new_capacity = stack->capacity * 2;
|
||||||
|
|
||||||
|
Node **firstNode = (Node **)TidyAlloc(stack->allocator, new_capacity);
|
||||||
|
|
||||||
|
memcpy( firstNode, stack->firstNode, sizeof(Node**) * (stack->top + 1) );
|
||||||
|
TidyFree(stack->allocator, stack->firstNode);
|
||||||
|
|
||||||
|
stack->firstNode = firstNode;
|
||||||
|
stack->capacity = new_capacity;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stack is full when top is equal to the last index.
|
||||||
|
*/
|
||||||
|
Bool TY_(stackFull)(Stack *stack)
|
||||||
|
{
|
||||||
|
return stack->top == stack->capacity - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stack is empty when top is equal to -1
|
||||||
|
*/
|
||||||
|
Bool TY_(stackEmpty)(Stack *stack)
|
||||||
|
{
|
||||||
|
return stack->top == -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Push an item to the stack.
|
||||||
|
*/
|
||||||
|
void TY_(push)(Stack *stack, Node *node)
|
||||||
|
{
|
||||||
|
if (TY_(stackFull)(stack))
|
||||||
|
TY_(growStack)(stack);
|
||||||
|
|
||||||
|
if (node)
|
||||||
|
stack->firstNode[++stack->top] = node;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Pop an item from the stack.
|
||||||
|
*/
|
||||||
|
Node* TY_(pop)(Stack *stack)
|
||||||
|
{
|
||||||
|
return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top--];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Peek at the stack.
|
||||||
|
*/
|
||||||
|
FUNC_UNUSED Node* TY_(peek)(Stack *stack)
|
||||||
|
{
|
||||||
|
return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top--];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Frees the stack when done.
|
||||||
|
*/
|
||||||
|
void TY_(freeStack)(Stack *stack)
|
||||||
|
{
|
||||||
|
TidyFree( stack->allocator, stack->firstNode );
|
||||||
|
stack->top = -1;
|
||||||
|
stack->capacity = 0;
|
||||||
|
stack->firstNode = NULL;
|
||||||
|
stack->allocator = NULL;
|
||||||
|
}
|
||||||
|
|
692
src/lexer.h
692
src/lexer.h
|
@ -1,33 +1,46 @@
|
||||||
#ifndef __LEXER_H__
|
#ifndef __LEXER_H__
|
||||||
#define __LEXER_H__
|
#define __LEXER_H__
|
||||||
|
|
||||||
/* lexer.h -- Lexer for html parser
|
|
||||||
|
|
||||||
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
|
/**************************************************************************//**
|
||||||
See tidy.h for the copyright notice.
|
* @file
|
||||||
|
* Lexer for HTML and XML Parsers.
|
||||||
Given an input source, it returns a sequence of tokens.
|
*
|
||||||
|
* Given an input source, it returns a sequence of tokens.
|
||||||
GetToken(source) gets the next token
|
*
|
||||||
UngetToken(source) provides one level undo
|
* GetToken(source) gets the next token
|
||||||
|
* UngetToken(source) provides one level undo
|
||||||
The tags include an attribute list:
|
*
|
||||||
|
* The tags include an attribute list:
|
||||||
- linked list of attribute/value nodes
|
*
|
||||||
- each node has 2 NULL-terminated strings.
|
* - linked list of attribute/value nodes
|
||||||
- entities are replaced in attribute values
|
* - each node has 2 NULL-terminated strings.
|
||||||
|
* - entities are replaced in attribute values
|
||||||
white space is compacted if not in preformatted mode
|
*
|
||||||
If not in preformatted mode then leading white space
|
* white space is compacted if not in preformatted mode
|
||||||
is discarded and subsequent white space sequences
|
* If not in preformatted mode then leading white space
|
||||||
compacted to single space characters.
|
* is discarded and subsequent white space sequences
|
||||||
|
* compacted to single space characters.
|
||||||
If XmlTags is no then Tag names are folded to upper
|
*
|
||||||
case and attribute names to lower case.
|
* If XmlTags is no then Tag names are folded to upper
|
||||||
|
* case and attribute names to lower case.
|
||||||
Not yet done:
|
*
|
||||||
- Doctype subset and marked sections
|
* Not yet done:
|
||||||
*/
|
* - Doctype subset and marked sections
|
||||||
|
*
|
||||||
|
* @author HTACG, et al (consult git log)
|
||||||
|
*
|
||||||
|
* @copyright
|
||||||
|
* (c) 1998-2021 (W3C) MIT, ERCIM, Keio University, and HTACG.
|
||||||
|
* See tidy.h for the copyright notice.
|
||||||
|
* @par
|
||||||
|
* All Rights Reserved.
|
||||||
|
* @par
|
||||||
|
* See `tidy.h` for the complete license.
|
||||||
|
*
|
||||||
|
* @date Additional updates: consult git log
|
||||||
|
*
|
||||||
|
******************************************************************************/
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
@ -35,8 +48,23 @@ extern "C" {
|
||||||
|
|
||||||
#include "forward.h"
|
#include "forward.h"
|
||||||
|
|
||||||
/* lexer character types
|
/** @addtogroup internal_api */
|
||||||
*/
|
/** @{ */
|
||||||
|
|
||||||
|
|
||||||
|
/***************************************************************************//**
|
||||||
|
** @defgroup lexer_h HTML and XML Lexing
|
||||||
|
**
|
||||||
|
** These functions and structures form the internal API for document
|
||||||
|
** lexing.
|
||||||
|
**
|
||||||
|
** @{
|
||||||
|
******************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Lexer character types.
|
||||||
|
*/
|
||||||
#define digit 1u
|
#define digit 1u
|
||||||
#define letter 2u
|
#define letter 2u
|
||||||
#define namechar 4u
|
#define namechar 4u
|
||||||
|
@ -47,8 +75,9 @@ extern "C" {
|
||||||
#define digithex 128u
|
#define digithex 128u
|
||||||
|
|
||||||
|
|
||||||
/* node->type is one of these values
|
/**
|
||||||
*/
|
* node->type is one of these values
|
||||||
|
*/
|
||||||
typedef enum
|
typedef enum
|
||||||
{
|
{
|
||||||
RootNode,
|
RootNode,
|
||||||
|
@ -68,9 +97,9 @@ typedef enum
|
||||||
} NodeType;
|
} NodeType;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
/* lexer GetToken states
|
* Lexer GetToken() states.
|
||||||
*/
|
*/
|
||||||
typedef enum
|
typedef enum
|
||||||
{
|
{
|
||||||
LEX_CONTENT,
|
LEX_CONTENT,
|
||||||
|
@ -88,7 +117,10 @@ typedef enum
|
||||||
LEX_XMLDECL
|
LEX_XMLDECL
|
||||||
} LexerState;
|
} LexerState;
|
||||||
|
|
||||||
/* ParseDocTypeDecl state constants */
|
|
||||||
|
/**
|
||||||
|
* ParseDocTypeDecl state constants.
|
||||||
|
*/
|
||||||
typedef enum
|
typedef enum
|
||||||
{
|
{
|
||||||
DT_INTERMEDIATE,
|
DT_INTERMEDIATE,
|
||||||
|
@ -98,67 +130,44 @@ typedef enum
|
||||||
DT_INTSUBSET
|
DT_INTSUBSET
|
||||||
} ParseDocTypeDeclState;
|
} ParseDocTypeDeclState;
|
||||||
|
|
||||||
/* content model shortcut encoding
|
|
||||||
|
|
||||||
Descriptions are tentative.
|
/**
|
||||||
*/
|
* Content model shortcut encoding.
|
||||||
|
* Descriptions are tentative.
|
||||||
|
*/
|
||||||
#define CM_UNKNOWN 0
|
#define CM_UNKNOWN 0
|
||||||
/* Elements with no content. Map to HTML specification. */
|
#define CM_EMPTY (1 << 0) /**< Elements with no content. Map to HTML specification. */
|
||||||
#define CM_EMPTY (1 << 0)
|
#define CM_HTML (1 << 1) /**< Elements that appear outside of "BODY". */
|
||||||
/* Elements that appear outside of "BODY". */
|
#define CM_HEAD (1 << 2) /**< Elements that can appear within HEAD. */
|
||||||
#define CM_HTML (1 << 1)
|
#define CM_BLOCK (1 << 3) /**< HTML "block" elements. */
|
||||||
/* Elements that can appear within HEAD. */
|
#define CM_INLINE (1 << 4) /**< HTML "inline" elements. */
|
||||||
#define CM_HEAD (1 << 2)
|
#define CM_LIST (1 << 5) /**< Elements that mark list item ("LI"). */
|
||||||
/* HTML "block" elements. */
|
#define CM_DEFLIST (1 << 6) /**< Elements that mark definition list item ("DL", "DT"). */
|
||||||
#define CM_BLOCK (1 << 3)
|
#define CM_TABLE (1 << 7) /**< Elements that can appear inside TABLE. */
|
||||||
/* HTML "inline" elements. */
|
#define CM_ROWGRP (1 << 8) /**< Used for "THEAD", "TFOOT" or "TBODY". */
|
||||||
#define CM_INLINE (1 << 4)
|
#define CM_ROW (1 << 9) /**< Used for "TD", "TH" */
|
||||||
/* Elements that mark list item ("LI"). */
|
#define CM_FIELD (1 << 10) /**< Elements whose content must be protected against white space movement. Includes some elements that can be found in forms. */
|
||||||
#define CM_LIST (1 << 5)
|
#define CM_OBJECT (1 << 11) /**< Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET. */
|
||||||
/* Elements that mark definition list item ("DL", "DT"). */
|
#define CM_PARAM (1 << 12) /**< Elements that allows "PARAM". */
|
||||||
#define CM_DEFLIST (1 << 6)
|
#define CM_FRAMES (1 << 13) /**< "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
|
||||||
/* Elements that can appear inside TABLE. */
|
#define CM_HEADING (1 << 14) /**< Heading elements (h1, h2, ...). */
|
||||||
#define CM_TABLE (1 << 7)
|
#define CM_OPT (1 << 15) /**< Elements with an optional end tag. */
|
||||||
/* Used for "THEAD", "TFOOT" or "TBODY". */
|
#define CM_IMG (1 << 16) /**< Elements that use "align" attribute for vertical position. */
|
||||||
#define CM_ROWGRP (1 << 8)
|
#define CM_MIXED (1 << 17) /**< Elements with inline and block model. Used to avoid calling InlineDup. */
|
||||||
/* Used for "TD", "TH" */
|
#define CM_NO_INDENT (1 << 18) /**< Elements whose content needs to be indented only if containing one CM_BLOCK element. */
|
||||||
#define CM_ROW (1 << 9)
|
#define CM_OBSOLETE (1 << 19) /**< Elements that are obsolete (such as "dir", "menu"). */
|
||||||
/* Elements whose content must be protected against white space movement.
|
#define CM_NEW (1 << 20) /**< User defined elements. Used to determine how attributes without value should be printed. */
|
||||||
Includes some elements that can found in forms. */
|
#define CM_OMITST (1 << 21) /**< Elements that cannot be omitted. */
|
||||||
#define CM_FIELD (1 << 10)
|
|
||||||
/* Used to avoid propagating inline emphasis inside some elements
|
|
||||||
such as OBJECT or APPLET. */
|
|
||||||
#define CM_OBJECT (1 << 11)
|
|
||||||
/* Elements that allows "PARAM". */
|
|
||||||
#define CM_PARAM (1 << 12)
|
|
||||||
/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
|
|
||||||
#define CM_FRAMES (1 << 13)
|
|
||||||
/* Heading elements (h1, h2, ...). */
|
|
||||||
#define CM_HEADING (1 << 14)
|
|
||||||
/* Elements with an optional end tag. */
|
|
||||||
#define CM_OPT (1 << 15)
|
|
||||||
/* Elements that use "align" attribute for vertical position. */
|
|
||||||
#define CM_IMG (1 << 16)
|
|
||||||
/* Elements with inline and block model. Used to avoid calling InlineDup. */
|
|
||||||
#define CM_MIXED (1 << 17)
|
|
||||||
/* Elements whose content needs to be indented only if containing one
|
|
||||||
CM_BLOCK element. */
|
|
||||||
#define CM_NO_INDENT (1 << 18)
|
|
||||||
/* Elements that are obsolete (such as "dir", "menu"). */
|
|
||||||
#define CM_OBSOLETE (1 << 19)
|
|
||||||
/* User defined elements. Used to determine how attributes without value
|
|
||||||
should be printed. */
|
|
||||||
#define CM_NEW (1 << 20)
|
|
||||||
/* Elements that cannot be omitted. */
|
|
||||||
#define CM_OMITST (1 << 21)
|
|
||||||
|
|
||||||
/* If the document uses just HTML 2.0 tags and attributes described
|
|
||||||
** it as HTML 2.0 Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
|
/**
|
||||||
** If there are proprietary tags and attributes then describe it as
|
* If the document uses just HTML 2.0 tags and attributes, then
|
||||||
** HTML Proprietary. If it includes the xml-lang or xmlns attributes
|
* it is HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
|
||||||
** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
|
* If there are proprietary tags and attributes then describe it as
|
||||||
** flavors of Voyager (strict, loose or frameset).
|
* HTML Proprietary. If it includes the xml-lang or xmlns attributes
|
||||||
*/
|
* but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
|
||||||
|
* flavors of Voyager (strict, loose or frameset).
|
||||||
|
*/
|
||||||
|
|
||||||
/* unknown */
|
/* unknown */
|
||||||
#define xxxx 0u
|
#define xxxx 0u
|
||||||
|
@ -220,8 +229,10 @@ typedef enum
|
||||||
/* all proprietary types */
|
/* all proprietary types */
|
||||||
#define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
|
#define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
|
||||||
|
|
||||||
/* Linked list of class names and styles
|
|
||||||
*/
|
/**
|
||||||
|
* Linked list of class names and styles
|
||||||
|
*/
|
||||||
struct _Style;
|
struct _Style;
|
||||||
typedef struct _Style TagStyle;
|
typedef struct _Style TagStyle;
|
||||||
|
|
||||||
|
@ -234,8 +245,9 @@ struct _Style
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/* Linked list of style properties
|
/**
|
||||||
*/
|
* Linked list of style properties
|
||||||
|
*/
|
||||||
struct _StyleProp;
|
struct _StyleProp;
|
||||||
typedef struct _StyleProp StyleProp;
|
typedef struct _StyleProp StyleProp;
|
||||||
|
|
||||||
|
@ -247,11 +259,9 @@ struct _StyleProp
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Attribute/Value linked list node
|
||||||
/* Attribute/Value linked list node
|
*/
|
||||||
*/
|
|
||||||
|
|
||||||
struct _AttVal
|
struct _AttVal
|
||||||
{
|
{
|
||||||
AttVal* next;
|
AttVal* next;
|
||||||
|
@ -264,93 +274,89 @@ struct _AttVal
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
/*
|
* Mosaic handles inlines via a separate stack from other elements
|
||||||
Mosaic handles inlines via a separate stack from other elements
|
* We duplicate this to recover from inline markup errors such as:
|
||||||
We duplicate this to recover from inline markup errors such as:
|
* ~~~
|
||||||
|
* <i>italic text
|
||||||
<i>italic text
|
* <p>more italic text</b> normal text
|
||||||
<p>more italic text</b> normal text
|
* ~~~
|
||||||
|
* which for compatibility with Mosaic is mapped to:
|
||||||
which for compatibility with Mosaic is mapped to:
|
* ~~~
|
||||||
|
* <i>italic text</i>
|
||||||
<i>italic text</i>
|
* <p><i>more italic text</i> normal text
|
||||||
<p><i>more italic text</i> normal text
|
* ~~~
|
||||||
|
* Note that any inline end tag pops the effect of the current
|
||||||
Note that any inline end tag pop's the effect of the current
|
* inline start tag, so that `</b>` pops `<i>` in the above example.
|
||||||
inline start tag, so that </b> pop's <i> in the above example.
|
|
||||||
*/
|
*/
|
||||||
struct _IStack
|
struct _IStack
|
||||||
{
|
{
|
||||||
IStack* next;
|
IStack* next;
|
||||||
const Dict* tag; /* tag's dictionary definition */
|
const Dict* tag; /**< tag's dictionary definition */
|
||||||
tmbstr element; /* name (NULL for text nodes) */
|
tmbstr element; /**< name (NULL for text nodes) */
|
||||||
AttVal* attributes;
|
AttVal* attributes;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl,
|
/**
|
||||||
** etc. etc.
|
* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
struct _Node
|
struct _Node
|
||||||
{
|
{
|
||||||
Node* parent; /* tree structure */
|
Node* parent; /**< tree structure */
|
||||||
Node* prev;
|
Node* prev;
|
||||||
Node* next;
|
Node* next;
|
||||||
Node* content;
|
Node* content;
|
||||||
Node* last;
|
Node* last;
|
||||||
|
|
||||||
AttVal* attributes;
|
AttVal* attributes;
|
||||||
const Dict* was; /* old tag when it was changed */
|
const Dict* was; /**< old tag when it was changed */
|
||||||
const Dict* tag; /* tag's dictionary definition */
|
const Dict* tag; /**< tag's dictionary definition */
|
||||||
|
|
||||||
tmbstr element; /* name (NULL for text nodes) */
|
tmbstr element; /**< name (NULL for text nodes) */
|
||||||
|
|
||||||
uint start; /* start of span onto text array */
|
uint start; /**< start of span onto text array */
|
||||||
uint end; /* end of span onto text array */
|
uint end; /**< end of span onto text array */
|
||||||
NodeType type; /* TextNode, StartTag, EndTag etc. */
|
NodeType type; /**< TextNode, StartTag, EndTag etc. */
|
||||||
|
|
||||||
uint line; /* current line of document */
|
uint line; /**< current line of document */
|
||||||
uint column; /* current column of document */
|
uint column; /**< current column of document */
|
||||||
|
|
||||||
Bool closed; /* true if closed by explicit end tag */
|
Bool closed; /**< true if closed by explicit end tag */
|
||||||
Bool implicit; /* true if inferred */
|
Bool implicit; /**< true if inferred */
|
||||||
Bool linebreak; /* true if followed by a line break */
|
Bool linebreak; /**< true if followed by a line break */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/*
|
/**
|
||||||
The following are private to the lexer
|
* The following are private to the lexer.
|
||||||
Use NewLexer() to create a lexer, and
|
* Use `NewLexer()` to create a lexer, and `FreeLexer()` to free it.
|
||||||
FreeLexer() to free it.
|
*/
|
||||||
*/
|
|
||||||
|
|
||||||
struct _Lexer
|
struct _Lexer
|
||||||
{
|
{
|
||||||
uint lines; /* lines seen */
|
uint lines; /**< lines seen */
|
||||||
uint columns; /* at start of current token */
|
uint columns; /**< at start of current token */
|
||||||
Bool waswhite; /* used to collapse contiguous white space */
|
Bool waswhite; /**< used to collapse contiguous white space */
|
||||||
Bool pushed; /* true after token has been pushed back */
|
Bool pushed; /**< true after token has been pushed back */
|
||||||
Bool insertspace; /* when space is moved after end tag */
|
Bool insertspace; /**< when space is moved after end tag */
|
||||||
Bool excludeBlocks; /* Netscape compatibility */
|
Bool excludeBlocks; /**< Netscape compatibility */
|
||||||
Bool exiled; /* true if moved out of table */
|
Bool exiled; /**< true if moved out of table */
|
||||||
Bool isvoyager; /* true if xmlns attribute on html element */
|
Bool isvoyager; /**< true if xmlns attribute on html element (i.e., "Voyager" was the W3C codename for XHTML). */
|
||||||
uint versions; /* bit vector of HTML versions */
|
uint versions; /**< bit vector of HTML versions */
|
||||||
uint doctype; /* version as given by doctype (if any) */
|
uint doctype; /**< version as given by doctype (if any) */
|
||||||
uint versionEmitted; /* version of doctype emitted */
|
uint versionEmitted; /**< version of doctype emitted */
|
||||||
Bool bad_doctype; /* e.g. if html or PUBLIC is missing */
|
Bool bad_doctype; /**< e.g. if html or PUBLIC is missing */
|
||||||
uint txtstart; /* start of current node */
|
uint txtstart; /**< start of current node */
|
||||||
uint txtend; /* end of current node */
|
uint txtend; /**< end of current node */
|
||||||
LexerState state; /* state of lexer's finite state machine */
|
LexerState state; /**< state of lexer's finite state machine */
|
||||||
|
|
||||||
Node* token; /* last token returned by GetToken() */
|
Node* token; /**< last token returned by GetToken() */
|
||||||
Node* itoken; /* last duplicate inline returned by GetToken() */
|
Node* itoken; /**< last duplicate inline returned by GetToken() */
|
||||||
Node* root; /* remember root node of the document */
|
Node* root; /**< remember root node of the document */
|
||||||
Node* parent; /* remember parent node for CDATA elements */
|
Node* parent; /**< remember parent node for CDATA elements */
|
||||||
|
|
||||||
Bool seenEndBody; /* true if a </body> tag has been encountered */
|
Bool seenEndBody; /**< true if a `</body>` tag has been encountered */
|
||||||
Bool seenEndHtml; /* true if a </html> tag has been encountered */
|
Bool seenEndHtml; /**< true if a `</html>` tag has been encountered */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Lexer character buffer
|
Lexer character buffer
|
||||||
|
@ -361,33 +367,57 @@ struct _Lexer
|
||||||
|
|
||||||
lexsize must be reset for each file.
|
lexsize must be reset for each file.
|
||||||
*/
|
*/
|
||||||
tmbstr lexbuf; /* MB character buffer */
|
tmbstr lexbuf; /**< MB character buffer */
|
||||||
uint lexlength; /* allocated */
|
uint lexlength; /**< allocated */
|
||||||
uint lexsize; /* used */
|
uint lexsize; /**< used */
|
||||||
|
|
||||||
/* Inline stack for compatibility with Mosaic */
|
/* Inline stack for compatibility with Mosaic */
|
||||||
Node* inode; /* for deferring text node */
|
Node* inode; /**< for deferring text node */
|
||||||
IStack* insert; /* for inferring inline tags */
|
IStack* insert; /**< for inferring inline tags */
|
||||||
IStack* istack;
|
IStack* istack;
|
||||||
uint istacklength; /* allocated */
|
uint istacklength; /**< allocated */
|
||||||
uint istacksize; /* used */
|
uint istacksize; /**< used */
|
||||||
uint istackbase; /* start of frame */
|
uint istackbase; /**< start of frame */
|
||||||
|
|
||||||
TagStyle *styles; /* used for cleaning up presentation markup */
|
TagStyle *styles; /**< used for cleaning up presentation markup */
|
||||||
|
|
||||||
TidyAllocator* allocator; /* allocator */
|
TidyAllocator* allocator; /**< allocator */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/* Lexer Functions
|
/**
|
||||||
*/
|
* modes for GetToken()
|
||||||
|
*
|
||||||
|
* MixedContent -- for elements which don't accept PCDATA
|
||||||
|
* Preformatted -- white space preserved as is
|
||||||
|
* IgnoreMarkup -- for CDATA elements such as script, style
|
||||||
|
*/
|
||||||
|
typedef enum
|
||||||
|
{
|
||||||
|
IgnoreWhitespace,
|
||||||
|
MixedContent,
|
||||||
|
Preformatted,
|
||||||
|
IgnoreMarkup,
|
||||||
|
OtherNamespace,
|
||||||
|
CdataContent
|
||||||
|
} GetTokenMode;
|
||||||
|
|
||||||
/* choose what version to use for new doctype */
|
|
||||||
|
/** @name Lexer Functions
|
||||||
|
* @{
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Choose what version to use for new doctype
|
||||||
|
*/
|
||||||
TY_PRIVATE int TY_(HTMLVersion)( TidyDocImpl* doc );
|
TY_PRIVATE int TY_(HTMLVersion)( TidyDocImpl* doc );
|
||||||
|
|
||||||
/* everything is allowed in proprietary version of HTML */
|
|
||||||
/* this is handled here rather than in the tag/attr dicts */
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Everything is allowed in proprietary version of HTML.
|
||||||
|
* This is handled here rather than in the tag/attr dicts
|
||||||
|
*/
|
||||||
TY_PRIVATE void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers );
|
TY_PRIVATE void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers );
|
||||||
|
|
||||||
TY_PRIVATE Bool TY_(IsWhite)(uint c);
|
TY_PRIVATE Bool TY_(IsWhite)(uint c);
|
||||||
|
@ -399,7 +429,6 @@ TY_PRIVATE Bool TY_(IsNamechar)(uint c);
|
||||||
TY_PRIVATE Bool TY_(IsXMLLetter)(uint c);
|
TY_PRIVATE Bool TY_(IsXMLLetter)(uint c);
|
||||||
TY_PRIVATE Bool TY_(IsXMLNamechar)(uint c);
|
TY_PRIVATE Bool TY_(IsXMLNamechar)(uint c);
|
||||||
|
|
||||||
/* Bool IsLower(uint c); */
|
|
||||||
TY_PRIVATE Bool TY_(IsUpper)(uint c);
|
TY_PRIVATE Bool TY_(IsUpper)(uint c);
|
||||||
TY_PRIVATE uint TY_(ToLower)(uint c);
|
TY_PRIVATE uint TY_(ToLower)(uint c);
|
||||||
TY_PRIVATE uint TY_(ToUpper)(uint c);
|
TY_PRIVATE uint TY_(ToUpper)(uint c);
|
||||||
|
@ -407,60 +436,82 @@ TY_PRIVATE uint TY_(ToUpper)(uint c);
|
||||||
TY_PRIVATE Lexer* TY_(NewLexer)( TidyDocImpl* doc );
|
TY_PRIVATE Lexer* TY_(NewLexer)( TidyDocImpl* doc );
|
||||||
TY_PRIVATE void TY_(FreeLexer)( TidyDocImpl* doc );
|
TY_PRIVATE void TY_(FreeLexer)( TidyDocImpl* doc );
|
||||||
|
|
||||||
/* store character c as UTF-8 encoded byte stream */
|
|
||||||
|
/**
|
||||||
|
* Store character c as UTF-8 encoded byte stream
|
||||||
|
*/
|
||||||
TY_PRIVATE void TY_(AddCharToLexer)( Lexer *lexer, uint c );
|
TY_PRIVATE void TY_(AddCharToLexer)( Lexer *lexer, uint c );
|
||||||
|
|
||||||
/*
|
|
||||||
Used for elements and text nodes
|
|
||||||
element name is NULL for text nodes
|
|
||||||
start and end are offsets into lexbuf
|
|
||||||
which contains the textual content of
|
|
||||||
all elements in the parse tree.
|
|
||||||
|
|
||||||
parent and content allow traversal
|
/**
|
||||||
of the parse tree in any direction.
|
* Used for elements and text nodes.
|
||||||
attributes are represented as a linked
|
* - Element name is NULL for text nodes.
|
||||||
list of AttVal nodes which hold the
|
* - start and end are offsets into lexbuf,
|
||||||
strings for attribute/value pairs.
|
* which contains the textual content of
|
||||||
|
* all elements in the parse tree.
|
||||||
|
* - parent and content allow traversal
|
||||||
|
* of the parse tree in any direction.
|
||||||
|
* - attributes are represented as a linked
|
||||||
|
* list of AttVal nodes which hold the
|
||||||
|
* strings for attribute/value pairs.
|
||||||
*/
|
*/
|
||||||
TY_PRIVATE Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer );
|
TY_PRIVATE Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer );
|
||||||
|
|
||||||
|
|
||||||
/* used to clone heading nodes when split by an <HR> */
|
/**
|
||||||
|
* Used to clone heading nodes when split by an `<HR>`
|
||||||
|
*/
|
||||||
TY_PRIVATE Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element );
|
TY_PRIVATE Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element );
|
||||||
|
|
||||||
/* free node's attributes */
|
|
||||||
|
/**
|
||||||
|
* Free node's attributes
|
||||||
|
*/
|
||||||
TY_PRIVATE void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node );
|
TY_PRIVATE void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node );
|
||||||
|
|
||||||
/* doesn't repair attribute list linkage */
|
|
||||||
|
/**
|
||||||
|
* Doesn't repair attribute list linkage
|
||||||
|
*/
|
||||||
TY_PRIVATE void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av );
|
TY_PRIVATE void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av );
|
||||||
|
|
||||||
/* detach attribute from node */
|
|
||||||
|
/**
|
||||||
|
* Detach attribute from node
|
||||||
|
*/
|
||||||
TY_PRIVATE void TY_(DetachAttribute)( Node *node, AttVal *attr );
|
TY_PRIVATE void TY_(DetachAttribute)( Node *node, AttVal *attr );
|
||||||
|
|
||||||
/* detach attribute from node then free it
|
|
||||||
*/
|
/**
|
||||||
|
* Detach attribute from node then free it.
|
||||||
|
*/
|
||||||
TY_PRIVATE void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr );
|
TY_PRIVATE void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr );
|
||||||
|
|
||||||
/*
|
|
||||||
Free document nodes by iterating through peers and recursing
|
/**
|
||||||
through children. Set next to NULL before calling FreeNode()
|
* Free document nodes by iterating through peers and recursing
|
||||||
to avoid freeing peer nodes. Doesn't patch up prev/next links.
|
* through children. Set `next` to `NULL` before calling `FreeNode()`
|
||||||
|
* to avoid freeing peer nodes. Doesn't patch up prev/next links.
|
||||||
*/
|
*/
|
||||||
TY_PRIVATE void TY_(FreeNode)( TidyDocImpl* doc, Node *node );
|
TY_PRIVATE void TY_(FreeNode)( TidyDocImpl* doc, Node *node );
|
||||||
|
|
||||||
|
|
||||||
TY_PRIVATE Node* TY_(TextToken)( Lexer *lexer );
|
TY_PRIVATE Node* TY_(TextToken)( Lexer *lexer );
|
||||||
|
|
||||||
/* used for creating preformatted text from Word2000 */
|
|
||||||
|
/**
|
||||||
|
* Used for creating preformatted text from Word2000.
|
||||||
|
*/
|
||||||
TY_PRIVATE Node* TY_(NewLineNode)( Lexer *lexer );
|
TY_PRIVATE Node* TY_(NewLineNode)( Lexer *lexer );
|
||||||
|
|
||||||
/* used for adding a for Word2000 */
|
|
||||||
|
/**
|
||||||
|
* Used for adding a `&nbsp;` for Word2000.
|
||||||
|
*/
|
||||||
TY_PRIVATE Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt );
|
TY_PRIVATE Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt );
|
||||||
|
|
||||||
TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
|
|
||||||
/* TY_PRIVATE void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); */
|
|
||||||
|
|
||||||
/* find element */
|
TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
|
||||||
TY_PRIVATE Node* TY_(FindDocType)( TidyDocImpl* doc );
|
TY_PRIVATE Node* TY_(FindDocType)( TidyDocImpl* doc );
|
||||||
TY_PRIVATE Node* TY_(FindHTML)( TidyDocImpl* doc );
|
TY_PRIVATE Node* TY_(FindHTML)( TidyDocImpl* doc );
|
||||||
TY_PRIVATE Node* TY_(FindHEAD)( TidyDocImpl* doc );
|
TY_PRIVATE Node* TY_(FindHEAD)( TidyDocImpl* doc );
|
||||||
|
@ -468,10 +519,16 @@ TY_PRIVATE Node* TY_(FindTITLE)(TidyDocImpl* doc);
|
||||||
TY_PRIVATE Node* TY_(FindBody)( TidyDocImpl* doc );
|
TY_PRIVATE Node* TY_(FindBody)( TidyDocImpl* doc );
|
||||||
TY_PRIVATE Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
|
TY_PRIVATE Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
|
||||||
|
|
||||||
/* Returns containing block element, if any */
|
|
||||||
|
/**
|
||||||
|
* Returns containing block element, if any
|
||||||
|
*/
|
||||||
TY_PRIVATE Node* TY_(FindContainer)( Node* node );
|
TY_PRIVATE Node* TY_(FindContainer)( Node* node );
|
||||||
|
|
||||||
/* add meta element for Tidy */
|
|
||||||
|
/**
|
||||||
|
* Add meta element for Tidy.
|
||||||
|
*/
|
||||||
TY_PRIVATE Bool TY_(AddGenerator)( TidyDocImpl* doc );
|
TY_PRIVATE Bool TY_(AddGenerator)( TidyDocImpl* doc );
|
||||||
|
|
||||||
TY_PRIVATE uint TY_(ApparentVersion)( TidyDocImpl* doc );
|
TY_PRIVATE uint TY_(ApparentVersion)( TidyDocImpl* doc );
|
||||||
|
@ -485,118 +542,209 @@ TY_PRIVATE Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc );
|
||||||
TY_PRIVATE Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc );
|
TY_PRIVATE Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc );
|
||||||
|
|
||||||
|
|
||||||
/* fixup doctype if missing */
|
/**
|
||||||
|
* Fixup doctype if missing.
|
||||||
|
*/
|
||||||
TY_PRIVATE Bool TY_(FixDocType)( TidyDocImpl* doc );
|
TY_PRIVATE Bool TY_(FixDocType)( TidyDocImpl* doc );
|
||||||
|
|
||||||
/* ensure XML document starts with <?xml version="1.0"?> */
|
|
||||||
/* add encoding attribute if not using ASCII or UTF-8 output */
|
/**
|
||||||
|
* Ensure XML document starts with <?xml version="1.0"?>, and
|
||||||
|
* add encoding attribute if not using ASCII or UTF-8 output.
|
||||||
|
*/
|
||||||
TY_PRIVATE Bool TY_(FixXmlDecl)( TidyDocImpl* doc );
|
TY_PRIVATE Bool TY_(FixXmlDecl)( TidyDocImpl* doc );
|
||||||
|
|
||||||
|
|
||||||
TY_PRIVATE Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id);
|
TY_PRIVATE Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id);
|
||||||
|
|
||||||
TY_PRIVATE void TY_(UngetToken)( TidyDocImpl* doc );
|
TY_PRIVATE void TY_(UngetToken)( TidyDocImpl* doc );
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
modes for GetToken()
|
|
||||||
|
|
||||||
MixedContent -- for elements which don't accept PCDATA
|
|
||||||
Preformatted -- white space preserved as is
|
|
||||||
IgnoreMarkup -- for CDATA elements such as script, style
|
|
||||||
*/
|
|
||||||
typedef enum
|
|
||||||
{
|
|
||||||
IgnoreWhitespace,
|
|
||||||
MixedContent,
|
|
||||||
Preformatted,
|
|
||||||
IgnoreMarkup,
|
|
||||||
OtherNamespace,
|
|
||||||
CdataContent
|
|
||||||
} GetTokenMode;
|
|
||||||
|
|
||||||
TY_PRIVATE Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode );
|
TY_PRIVATE Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode );
|
||||||
|
|
||||||
TY_PRIVATE void TY_(InitMap)(void);
|
TY_PRIVATE void TY_(InitMap)(void);
|
||||||
|
|
||||||
|
|
||||||
/* create a new attribute */
|
/**
|
||||||
|
* Create a new attribute.
|
||||||
|
*/
|
||||||
TY_PRIVATE AttVal* TY_(NewAttribute)( TidyDocImpl* doc );
|
TY_PRIVATE AttVal* TY_(NewAttribute)( TidyDocImpl* doc );
|
||||||
|
|
||||||
/* create a new attribute with given name and value */
|
|
||||||
|
/**
|
||||||
|
* Create a new attribute with given name and value.
|
||||||
|
*/
|
||||||
TY_PRIVATE AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
|
TY_PRIVATE AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
|
||||||
int delim );
|
int delim );
|
||||||
|
|
||||||
/* insert attribute at the end of attribute list of a node */
|
|
||||||
|
/**
|
||||||
|
* Insert attribute at the end of attribute list of a node.
|
||||||
|
*/
|
||||||
TY_PRIVATE void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av );
|
TY_PRIVATE void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av );
|
||||||
|
|
||||||
/* insert attribute at the start of attribute list of a node */
|
/**
|
||||||
|
* Insert attribute at the start of attribute list of a node.
|
||||||
|
*/
|
||||||
TY_PRIVATE void TY_(InsertAttributeAtStart)( Node *node, AttVal *av );
|
TY_PRIVATE void TY_(InsertAttributeAtStart)( Node *node, AttVal *av );
|
||||||
|
|
||||||
/*************************************
|
|
||||||
In-line Stack functions
|
/** @}
|
||||||
*************************************/
|
* @name Inline Stack Functions
|
||||||
|
* @{
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
/* duplicate attributes */
|
/**
|
||||||
|
* Duplicate attributes.
|
||||||
|
*/
|
||||||
TY_PRIVATE AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs );
|
TY_PRIVATE AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs );
|
||||||
|
|
||||||
/*
|
|
||||||
push a copy of an inline node onto stack
|
|
||||||
but don't push if implicit or OBJECT or APPLET
|
|
||||||
(implicit tags are ones generated from the istack)
|
|
||||||
|
|
||||||
One issue arises with pushing inlines when
|
/**
|
||||||
the tag is already pushed. For instance:
|
* Push a copy of an inline node onto stack, but don't push if
|
||||||
|
* implicit or OBJECT or APPLET (implicit tags are ones generated
|
||||||
<p><em>text
|
* from the istack).
|
||||||
<p><em>more text
|
*
|
||||||
|
* One issue arises with pushing inlines when the tag is already pushed.
|
||||||
Shouldn't be mapped to
|
* For instance:
|
||||||
|
* ~~~
|
||||||
<p><em>text</em></p>
|
* <p><em>text
|
||||||
<p><em><em>more text</em></em>
|
* <p><em>more text
|
||||||
*/
|
* ~~~
|
||||||
|
* Shouldn't be mapped to
|
||||||
|
* ~~~
|
||||||
|
* <p><em>text</em></p>
|
||||||
|
* <p><em><em>more text</em></em>
|
||||||
|
* ~~~
|
||||||
|
*/
|
||||||
TY_PRIVATE void TY_(PushInline)( TidyDocImpl* doc, Node* node );
|
TY_PRIVATE void TY_(PushInline)( TidyDocImpl* doc, Node* node );
|
||||||
|
|
||||||
/* pop inline stack */
|
|
||||||
|
/**
|
||||||
|
* Pop inline stack.
|
||||||
|
*/
|
||||||
TY_PRIVATE void TY_(PopInline)( TidyDocImpl* doc, Node* node );
|
TY_PRIVATE void TY_(PopInline)( TidyDocImpl* doc, Node* node );
|
||||||
|
|
||||||
|
|
||||||
TY_PRIVATE Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node );
|
TY_PRIVATE Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node );
|
||||||
TY_PRIVATE Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node );
|
TY_PRIVATE Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node );
|
||||||
|
|
||||||
/*
|
|
||||||
This has the effect of inserting "missing" inline
|
|
||||||
elements around the contents of blocklevel elements
|
|
||||||
such as P, TD, TH, DIV, PRE etc. This procedure is
|
|
||||||
called at the start of ParseBlock. when the inline
|
|
||||||
stack is not empty, as will be the case in:
|
|
||||||
|
|
||||||
<i><h1>italic heading</h1></i>
|
/**
|
||||||
|
* This has the effect of inserting "missing" inline elements around the
|
||||||
which is then treated as equivalent to
|
* contents of blocklevel elements such as P, TD, TH, DIV, PRE etc. This
|
||||||
|
* procedure is called at the start of `ParseBlock`, when the inline
|
||||||
<h1><i>italic heading</i></h1>
|
* stack is not empty, as will be the case in:
|
||||||
|
* ~~~
|
||||||
This is implemented by setting the lexer into a mode
|
* <i><h1>italic heading</h1></i>
|
||||||
where it gets tokens from the inline stack rather than
|
* ~~~
|
||||||
from the input stream.
|
* which is then treated as equivalent to
|
||||||
*/
|
* ~~~
|
||||||
|
* <h1><i>italic heading</i></h1>
|
||||||
|
* ~~~
|
||||||
|
* This is implemented by setting the lexer into a mode where it gets
|
||||||
|
* tokens from the inline stack rather than from the input stream.
|
||||||
|
*/
|
||||||
TY_PRIVATE int TY_(InlineDup)( TidyDocImpl* doc, Node *node );
|
TY_PRIVATE int TY_(InlineDup)( TidyDocImpl* doc, Node *node );
|
||||||
|
|
||||||
/*
|
|
||||||
defer duplicates when entering a table or other
|
/**
|
||||||
element where the inlines shouldn't be duplicated
|
* Defer duplicates when entering a table or other
|
||||||
*/
|
* element where the inlines shouldn't be duplicated.
|
||||||
|
*/
|
||||||
TY_PRIVATE void TY_(DeferDup)( TidyDocImpl* doc );
|
TY_PRIVATE void TY_(DeferDup)( TidyDocImpl* doc );
|
||||||
|
|
||||||
|
|
||||||
TY_PRIVATE Node* TY_(InsertedToken)( TidyDocImpl* doc );
|
TY_PRIVATE Node* TY_(InsertedToken)( TidyDocImpl* doc );
|
||||||
|
|
||||||
/* stack manipulation for inline elements */
|
/**
|
||||||
|
* Stack manipulation for inline elements
|
||||||
|
*/
|
||||||
TY_PRIVATE Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node );
|
TY_PRIVATE Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node );
|
||||||
|
|
||||||
|
|
||||||
TY_PRIVATE Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element );
|
TY_PRIVATE Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element );
|
||||||
|
|
||||||
|
|
||||||
|
/** @}
|
||||||
|
* @name Generic stack of nodes.
|
||||||
|
* @{
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This typedef represents a stack of addresses to nodes. Tidy uses these to
|
||||||
|
* try to limit recursion by pushing nodes to a stack when possible instead
|
||||||
|
* of recursing.
|
||||||
|
*/
|
||||||
|
typedef struct _Stack {
|
||||||
|
int top; /**< Current top position. */
|
||||||
|
unsigned capacity; /**< Current capacity. Can be expanded. */
|
||||||
|
Node **firstNode; /**< A pointer to the first pointer to a Node in an array of node addresses. */
|
||||||
|
TidyAllocator* allocator; /**< Tidy's allocator, used at instantiation and expanding. */
|
||||||
|
} Stack;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new stack with a given starting capacity. If memory allocation
|
||||||
|
* fails, then the allocator will panic the program automatically.
|
||||||
|
*/
|
||||||
|
TY_PRIVATE Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Increase the stack size. This will be called automatically when the
|
||||||
|
* current stack is full. If memory allocation fails, then the allocator
|
||||||
|
* will panic the program automatically.
|
||||||
|
*/
|
||||||
|
TY_PRIVATE void TY_(growStack)(Stack *stack);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stack is full when top is equal to the last index.
|
||||||
|
*/
|
||||||
|
TY_PRIVATE Bool TY_(stackFull)(Stack *stack);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stack is empty when top is equal to -1
|
||||||
|
*/
|
||||||
|
TY_PRIVATE Bool TY_(stackEmpty)(Stack *stack);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Push an item to the stack.
|
||||||
|
*/
|
||||||
|
TY_PRIVATE void TY_(push)(Stack *stack, Node *node);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Pop an item from the stack.
|
||||||
|
*/
|
||||||
|
TY_PRIVATE Node* TY_(pop)(Stack *stack);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Peek at the stack.
|
||||||
|
*/
|
||||||
|
TY_PRIVATE Node* TY_(peek)(Stack *stack);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Frees the stack when done.
|
||||||
|
*/
|
||||||
|
TY_PRIVATE void TY_(freeStack)(Stack *stack);
|
||||||
|
|
||||||
|
|
||||||
|
/** @}
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/** @} end parser_h group */
|
||||||
|
/** @} end internal_api group */
|
||||||
|
|
||||||
#endif /* __LEXER_H__ */
|
#endif /* __LEXER_H__ */
|
||||||
|
|
4090
src/parser.c
4090
src/parser.c
File diff suppressed because it is too large
Load diff
71
src/parser.h
71
src/parser.h
|
@ -41,6 +41,74 @@
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The parsers keep track of their states with the states defined here, and
|
||||||
|
* use these symbols when pushing to the stack so that they can later recreate
|
||||||
|
* their environments when re-entered.
|
||||||
|
*/
|
||||||
|
typedef enum {
|
||||||
|
/* Universal states. */
|
||||||
|
STATE_INITIAL, /**< This is the initial state for every parser. */
|
||||||
|
STATE_COMPLETE, /**< Complete! */
|
||||||
|
STATE_PARSE_TAG,
|
||||||
|
STATE_PARSE_TAG_DONE,
|
||||||
|
/* ParseHTML states. */
|
||||||
|
STATE_PRE_HEAD, /**< In this state, we've not detected head yet. */
|
||||||
|
STATE_PRE_BODY, /**< In this state, we'll consider frames vs. body. */
|
||||||
|
STATE_PARSE_BODY, /**< In this state, we can parse the body. */
|
||||||
|
STATE_PARSE_HEAD, /**< In this state, we will setup head for parsing. */
|
||||||
|
STATE_PARSE_HEAD_DONE, /**< Resume here after parsing head. */
|
||||||
|
STATE_PARSE_NOFRAMES, /**< In this state, we can parse noframes content. */
|
||||||
|
STATE_PARSE_NOFRAMES_DONE, /**< In this state, we can restore more state. */
|
||||||
|
STATE_PARSE_FRAMESET, /**< In this state, we will parse frameset content. */
|
||||||
|
STATE_PARSE_FRAMESET_DONE, /**< We need to cleanup some things after parsing frameset. */
|
||||||
|
} parserState;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This typedef represents the state of a parser when it enters and exits.
|
||||||
|
* When the parser needs to finish work on the way back up the stack, it will
|
||||||
|
* push one of these records to the stack, and it will pop a record from the
|
||||||
|
* stack upon re-entry.
|
||||||
|
*/
|
||||||
|
typedef struct _TidyParserMemory
|
||||||
|
{
|
||||||
|
Parser *identity; /**< Which parser pushed this record? */
|
||||||
|
Node *original_node; /**< Originally provided node at entry. */
|
||||||
|
Node *reentry_node; /**< A node a parser might want to save. */
|
||||||
|
GetTokenMode reentry_mode; /**< The mode to use for the next node. */
|
||||||
|
parserState reentry_state; /**< State to set during re-entry. */
|
||||||
|
GetTokenMode mode; /**< The caller will peek at this value to get the correct mode. */
|
||||||
|
} TidyParserMemory;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This typedef represents a stack of TidyParserMemory records. The Tidy document has its
|
||||||
|
* own instance of this.
|
||||||
|
*/
|
||||||
|
typedef struct _TidyParserStack
|
||||||
|
{
|
||||||
|
TidyParserMemory* content; /**< A state record. */
|
||||||
|
TidyAllocator* allocator; /**< The allocator used for creating. */
|
||||||
|
uint size; /**< Current size of the stack. */
|
||||||
|
int top; /**< Top of the stack. */
|
||||||
|
} TidyParserStack;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Allocates and initializes the parser's stack. TidyCreate will perform
|
||||||
|
* this automatically.
|
||||||
|
*/
|
||||||
|
void TY_(InitParserStack)( TidyDocImpl* doc );
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Frees the parser's stack when done. TidyRelease will perform this
|
||||||
|
* automatically.
|
||||||
|
*/
|
||||||
|
void TY_(FreeParserStack)( TidyDocImpl* doc );
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Is used to perform a node integrity check recursively after parsing
|
* Is used to perform a node integrity check recursively after parsing
|
||||||
* an HTML or XML document.
|
* an HTML or XML document.
|
||||||
|
@ -96,7 +164,7 @@ TY_PRIVATE Node *TY_(RemoveNode)(Node *node);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Remove node from markup tree and discard it.
|
* Remove node from markup tree and discard it.
|
||||||
* @param doc The Tidy document from which to discarb the node.
|
* @param doc The Tidy document from which to discard the node.
|
||||||
* @param element The node to discard.
|
* @param element The node to discard.
|
||||||
* @returns Returns the next node.
|
* @returns Returns the next node.
|
||||||
*/
|
*/
|
||||||
|
@ -202,4 +270,3 @@ TY_PRIVATE void TY_(ParseXMLDocument)( TidyDocImpl* doc );
|
||||||
/** @} end internal_api group */
|
/** @} end internal_api group */
|
||||||
|
|
||||||
#endif /* __PARSER_H__ */
|
#endif /* __PARSER_H__ */
|
||||||
|
|
||||||
|
|
|
@ -168,7 +168,7 @@ static CheckAttribs CheckHTML;
|
||||||
\*/
|
\*/
|
||||||
static Dict tag_defs[] =
|
static Dict tag_defs[] =
|
||||||
{
|
{
|
||||||
{ TidyTag_UNKNOWN, "unknown!", VERS_UNKNOWN, NULL, (0), NULL, NULL },
|
{ TidyTag_UNKNOWN, "unknown!", VERS_UNKNOWN, NULL, (0), NULL, NULL },
|
||||||
|
|
||||||
/* W3C defined elements */
|
/* W3C defined elements */
|
||||||
{ TidyTag_A, "a", VERS_ELEM_A, &TY_(W3CAttrsFor_A)[0], (CM_INLINE|CM_BLOCK|CM_MIXED), TY_(ParseBlock), NULL }, /* Issue #167 & #169 - default HTML5 */
|
{ TidyTag_A, "a", VERS_ELEM_A, &TY_(W3CAttrsFor_A)[0], (CM_INLINE|CM_BLOCK|CM_MIXED), TY_(ParseBlock), NULL }, /* Issue #167 & #169 - default HTML5 */
|
||||||
|
@ -332,7 +332,7 @@ static Dict tag_defs[] =
|
||||||
{ TidyTag_WBR, "wbr", VERS_ELEM_WBR, &TY_(W3CAttrsFor_WBR)[0], (CM_INLINE|CM_EMPTY), TY_(ParseEmpty), NULL },
|
{ TidyTag_WBR, "wbr", VERS_ELEM_WBR, &TY_(W3CAttrsFor_WBR)[0], (CM_INLINE|CM_EMPTY), TY_(ParseEmpty), NULL },
|
||||||
|
|
||||||
/* this must be the final entry */
|
/* this must be the final entry */
|
||||||
{ (TidyTagId)0, NULL, 0, NULL, (0), NULL, NULL }
|
{ (TidyTagId)0, NULL, 0, NULL, (0), NULL, NULL }
|
||||||
};
|
};
|
||||||
|
|
||||||
static uint tagsHash(ctmbstr s)
|
static uint tagsHash(ctmbstr s)
|
||||||
|
|
|
@ -61,8 +61,13 @@ typedef enum
|
||||||
|
|
||||||
|
|
||||||
/** This typedef describes a function to be used to parse HTML of a Tidy tag.
|
/** This typedef describes a function to be used to parse HTML of a Tidy tag.
|
||||||
|
** @param doc The Tidy document.
|
||||||
|
** @param node The node being parsed.
|
||||||
|
** @param mode The GetTokenMode to be used for parsing the node contents.
|
||||||
|
** @param popStack A flag indicating that we are re-entering this parser, and
|
||||||
|
** it should restore a state from the stack.
|
||||||
*/
|
*/
|
||||||
typedef void (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode );
|
typedef Node* (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode, Bool popStack );
|
||||||
|
|
||||||
|
|
||||||
/** This typedef describes a function to be used to check the attributes
|
/** This typedef describes a function to be used to check the attributes
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
#include "pprint.h"
|
#include "pprint.h"
|
||||||
#include "access.h"
|
#include "access.h"
|
||||||
#include "message.h"
|
#include "message.h"
|
||||||
|
#include "parser.h"
|
||||||
|
|
||||||
#ifndef MAX
|
#ifndef MAX
|
||||||
#define MAX(a,b) (((a) > (b))?(a):(b))
|
#define MAX(a,b) (((a) > (b))?(a):(b))
|
||||||
|
@ -41,19 +42,20 @@ struct _TidyDocImpl
|
||||||
Lexer* lexer;
|
Lexer* lexer;
|
||||||
|
|
||||||
/* Config + Markup Declarations */
|
/* Config + Markup Declarations */
|
||||||
TidyConfigImpl config;
|
TidyConfigImpl config;
|
||||||
TidyTagImpl tags;
|
TidyTagImpl tags;
|
||||||
TidyAttribImpl attribs;
|
TidyAttribImpl attribs;
|
||||||
TidyAccessImpl access;
|
TidyAccessImpl access;
|
||||||
TidyMutedMessages muted;
|
TidyMutedMessages muted;
|
||||||
|
|
||||||
/* The Pretty Print buffer */
|
/* The Pretty Print buffer */
|
||||||
TidyPrintImpl pprint;
|
TidyPrintImpl pprint;
|
||||||
|
|
||||||
/* I/O */
|
/* I/O */
|
||||||
StreamIn* docIn;
|
StreamIn* docIn;
|
||||||
StreamOut* docOut;
|
StreamOut* docOut;
|
||||||
StreamOut* errout;
|
StreamOut* errout;
|
||||||
|
|
||||||
TidyReportFilter reportFilter;
|
TidyReportFilter reportFilter;
|
||||||
TidyReportCallback reportCallback;
|
TidyReportCallback reportCallback;
|
||||||
TidyMessageCallback messageCallback;
|
TidyMessageCallback messageCallback;
|
||||||
|
@ -62,6 +64,8 @@ struct _TidyDocImpl
|
||||||
TidyConfigChangeCallback pConfigChangeCallback;
|
TidyConfigChangeCallback pConfigChangeCallback;
|
||||||
TidyPPProgress progressCallback;
|
TidyPPProgress progressCallback;
|
||||||
|
|
||||||
|
TidyParserStack stack;
|
||||||
|
|
||||||
/* Parse + Repair Results */
|
/* Parse + Repair Results */
|
||||||
uint optionErrors;
|
uint optionErrors;
|
||||||
uint errors;
|
uint errors;
|
||||||
|
|
|
@ -112,6 +112,7 @@ TidyDocImpl* tidyDocCreate( TidyAllocator *allocator )
|
||||||
TY_(InitAttrs)( doc );
|
TY_(InitAttrs)( doc );
|
||||||
TY_(InitConfig)( doc );
|
TY_(InitConfig)( doc );
|
||||||
TY_(InitPrintBuf)( doc );
|
TY_(InitPrintBuf)( doc );
|
||||||
|
TY_(InitParserStack)( doc );
|
||||||
|
|
||||||
/* Set the locale for tidy's output. This both configures
|
/* Set the locale for tidy's output. This both configures
|
||||||
** LibTidy to use the environment's locale as well as the
|
** LibTidy to use the environment's locale as well as the
|
||||||
|
@ -172,6 +173,7 @@ void tidyDocRelease( TidyDocImpl* doc )
|
||||||
* to determine which hash is to be used, so free it last.
|
* to determine which hash is to be used, so free it last.
|
||||||
\*/
|
\*/
|
||||||
TY_(FreeLexer)( doc );
|
TY_(FreeLexer)( doc );
|
||||||
|
TY_(FreeParserStack)( doc );
|
||||||
TidyDocFree( doc, doc );
|
TidyDocFree( doc, doc );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue