From 885c7caab7cd2fc2ea550662dbb325d2fd5f17cc Mon Sep 17 00:00:00 2001
From: Geoff McLane
Date: Mon, 2 Feb 2015 17:25:49 +0100
Subject: [PATCH] Issue #70 - Initial implementation of SVG support.

An immense thanks to Ger Hobbelt who had already done this in his
github.com/GerHobbelt/htmltidy fork. The two sources have diverged, so it was
not a simple cut and paste. But again thanks Ger for this.
---
 include/tidyenum.h |  12 +++-
 src/attrdict.c     |  35 +++++++++
 src/attrdict.h     |   1 +
 src/attrs.c        |  18 +++++
 src/lexer.h        |   1 +
 src/parser.c       | 173 +++++++++++++++++++++++++++++++++++++++++++++
 src/tags.c         |  37 ++++++++++
 src/tags.h         |   1 +
 src/tidy-int.h     |  22 ++++++
 9 files changed, 298 insertions(+), 2 deletions(-)

diff --git a/include/tidyenum.h b/include/tidyenum.h
index 6ccce2a..f0696c5 100644
--- a/include/tidyenum.h
+++ b/include/tidyenum.h
@@ -423,6 +423,7 @@ typedef enum
   TidyTag_STYLE,    /**< STYLE */
   TidyTag_SUB,      /**< SUB */
   TidyTag_SUP,      /**< SUP */
+  TidyTag_SVG,      /**< SVG (HTML5) */
   TidyTag_TABLE,    /**< TABLE */
   TidyTag_TBODY,    /**< TBODY */
   TidyTag_TD,       /**< TD */
@@ -783,8 +784,15 @@ typedef enum
   TidyAttr_ARIA_VALUENOW,
   TidyAttr_ARIA_VALUETEXT,
 
-
-
+  /* SVG attributes (SVG 1.1) */
+  TidyAttr_X,                   /**< X= */
+  TidyAttr_Y,                   /**< Y= */
+  TidyAttr_VIEWBOX,             /**< VIEWBOX= */
+  TidyAttr_PRESERVEASPECTRATIO, /**< PRESERVEASPECTRATIO= */
+  TidyAttr_ZOOMANDPAN,          /**< ZOOMANDPAN= */
+  TidyAttr_BASEPROFILE,         /**< BASEPROFILE= */
+  TidyAttr_CONTENTSCRIPTTYPE,   /**< CONTENTSCRIPTTYPE= */
+  TidyAttr_CONTENTSTYLETYPE,    /**< CONTENTSTYLETYPE= */
   N_TIDY_ATTRIBS              /**< Must be last */
 } TidyAttrId;
 
diff --git a/src/attrdict.c b/src/attrdict.c
index cb45f78..ddddaca 100644
--- a/src/attrdict.c
+++ b/src/attrdict.c
@@ -11693,6 +11693,41 @@ const AttrVersion TY_(W3CAttrsFor_SUP)[] =
   { TidyAttr_UNKNOWN, 0 },
 };
 
+const AttrVersion TY_(W3CAttrsFor_SVG)[] =
+{
+  { TidyAttr_ALIGN,               xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_CLASS,               xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_DIR,                 xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_ID,                  xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_HEIGHT,              xxxx|HT32|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|XB10|HT50|XH50 },
+  { TidyAttr_LANG,                xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|xxxx|xxxx|HT50|XH50 },
+  { TidyAttr_OnCLICK,             xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_OnDBLCLICK,          xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_OnKEYDOWN,           xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_OnKEYPRESS,          xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_OnKEYUP,             xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_OnMOUSEDOWN,         xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_OnMOUSEMOVE,         xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_OnMOUSEOUT,          xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_OnMOUSEOVER,         xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_OnMOUSEUP,           xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_STYLE,               xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_TITLE,               xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_WIDTH,               xxxx|HT32|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|XB10|HT50|XH50 },
+  { TidyAttr_XML_LANG,            xxxx|xxxx|xxxx|xxxx|X10T|xxxx|xxxx|X10F|xxxx|xxxx|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_XMLNS,               xxxx|xxxx|xxxx|xxxx|xxxx|xxxx|xxxx|xxxx|xxxx|xxxx|xxxx|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_X,                   xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|xxxx|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_Y,                   xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|xxxx|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_VIEWBOX,             xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|xxxx|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_PRESERVEASPECTRATIO, xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|xxxx|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_ZOOMANDPAN,          xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|xxxx|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_VERSION,             xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|xxxx|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_BASEPROFILE,         xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|xxxx|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_CONTENTSCRIPTTYPE,   xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|xxxx|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_CONTENTSTYLETYPE,    xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|xxxx|H41S|X10S|XH11|xxxx|HT50|XH50 },
+  { TidyAttr_UNKNOWN, 0 },
+};
+
 const AttrVersion TY_(W3CAttrsFor_TABLE)[] =
 {
   { TidyAttr_ACCESSKEY, xxxx|xxxx|xxxx|xxxx|xxxx|xxxx|xxxx|xxxx|xxxx|xxxx|xxxx|xxxx|xxxx|HT50|XH50 },
diff --git a/src/attrdict.h b/src/attrdict.h
index d1077e9..1855fa6 100644
--- a/src/attrdict.h
+++ b/src/attrdict.h
@@ -103,6 +103,7 @@ extern const AttrVersion TY_(W3CAttrsFor_STRONG)[];
 extern const AttrVersion TY_(W3CAttrsFor_STYLE)[];
 extern const AttrVersion TY_(W3CAttrsFor_SUB)[];
 extern const AttrVersion TY_(W3CAttrsFor_SUP)[];
+extern const AttrVersion TY_(W3CAttrsFor_SVG)[];
 extern const AttrVersion TY_(W3CAttrsFor_TABLE)[];
 extern const AttrVersion TY_(W3CAttrsFor_TBODY)[];
 extern const AttrVersion TY_(W3CAttrsFor_TD)[];
diff --git a/src/attrs.c b/src/attrs.c
index fa0356e..cbbc8e9 100644
--- a/src/attrs.c
+++ b/src/attrs.c
@@ -397,6 +397,24 @@ static const Attribute attribute_defs [] =
   { TidyAttr_ARIA_VALUENOW, "aria-valuenow", CH_PCDATA },
   { TidyAttr_ARIA_VALUETEXT, "aria-valuetext", CH_PCDATA },
 
+  { TidyAttr_X, "x", CH_PCDATA }, /* for <svg> */
+  { TidyAttr_Y, "y", CH_PCDATA }, /* for <svg> */
+#if 0 /* with uppercase chars taken directly from W3C; are these case-insensitive everywhere? */
+  { TidyAttr_VIEWBOX,             "viewBox",             VERS_INLINE_SVG, CH_PCDATA }, /* for <svg> */
+  { TidyAttr_PRESERVEASPECTRATIO, "preserveAspectRatio", VERS_INLINE_SVG, CH_PCDATA }, /* for <svg> */
+  { TidyAttr_ZOOMANDPAN,          "zoomAndPan",          VERS_INLINE_SVG, CH_PCDATA }, /* for <svg> */
+  { TidyAttr_BASEPROFILE,         "baseProfile",         VERS_INLINE_SVG, CH_PCDATA }, /* for <svg> */
+  { TidyAttr_CONTENTSCRIPTTYPE,   "contentScriptType",   VERS_INLINE_SVG, CH_PCDATA }, /* for <svg> */
+  { TidyAttr_CONTENTSTYLETYPE,    "contentStyleType",    VERS_INLINE_SVG, CH_PCDATA }, /* for <svg> */
+#else
+  { TidyAttr_VIEWBOX,             "viewbox",             CH_PCDATA }, /* for <svg> */
+  { TidyAttr_PRESERVEASPECTRATIO, "preserveaspectratio", CH_PCDATA }, /* for <svg> */
+  { TidyAttr_ZOOMANDPAN,          "zoomandpan",          CH_PCDATA }, /* for <svg> */
+  { TidyAttr_BASEPROFILE,         "baseprofile",         CH_PCDATA }, /* for <svg> */
+  { TidyAttr_CONTENTSCRIPTTYPE,   "contentscripttype",   CH_PCDATA }, /* for <svg> */
+  { TidyAttr_CONTENTSTYLETYPE,    "contentstyletype",    CH_PCDATA }, /* for <svg> */
+#endif
+
   /* this must be the final entry */
   { N_TIDY_ATTRIBS, NULL, NULL }
 };
diff --git a/src/lexer.h b/src/lexer.h
index c63e1a0..cd7897a 100644
--- a/src/lexer.h
+++ b/src/lexer.h
@@ -525,6 +525,7 @@ typedef enum
     MixedContent,
     Preformatted,
     IgnoreMarkup,
+    OtherNamespace,
     CdataContent
 } GetTokenMode;
 
diff --git a/src/parser.c b/src/parser.c
index 51fdf45..6011046 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -1359,6 +1359,179 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
 #endif
 }
 
+/* [i_a] svg / math */
+
+/* Search state shared by FindMatchingDescendant() and FindDescendant_cb(). */
+struct MatchingDescendantData
+{
+    Node *found_node;         /* output: the match; stays NULL when there is none */
+    Bool *passed_marker_node; /* output, optional: set to yes once marker_node was passed */
+
+    /* input: */
+    TidyTagId matching_tagId; /* tag id being searched for */
+    Node *node_to_find;       /* supplies the element name compared for 'unknown' tags */
+    Node *marker_node;        /* optional: node whose passing gets flagged */
+};
+
+/* Traversal callback: exit on a match, else flag a passed marker node and climb to the parent. */
+static NodeTraversalSignal FindDescendant_cb(TidyDocImpl* ARG_UNUSED(doc), Node* node, void *propagate)
+{
+    struct MatchingDescendantData *cb_data = (struct MatchingDescendantData *)propagate;
+
+    if (TagId(node) == cb_data->matching_tagId)
+    {
+        /* make sure we match up 'unknown' tags exactly! */
+        if (cb_data->matching_tagId != TidyTag_UNKNOWN ||
+            (node->element != NULL &&
+             cb_data->node_to_find != NULL &&
+             cb_data->node_to_find->element != NULL &&
+             0 == TY_(tmbstrcmp)(cb_data->node_to_find->element, node->element)))
+        {
+            cb_data->found_node = node;
+            return ExitTraversal;
+        }
+    }
+
+    if (cb_data->passed_marker_node && node == cb_data->marker_node)
+        *cb_data->passed_marker_node = yes;
+
+    return VisitParent;
+}
+
+/*
+Search the parent chain (from 'parent' upwards, up to the root) for a node matching the
+given 'node' (same tag id; same element name for 'unknown' tags). Despite the function
+name, it is the ancestors that are examined, not the descendants.
+When 'marker_node' is visited without matching, i.e. the search climbs past it, the
+boolean referenced by 'is_parent_of_marker' is set to yes. Both are optional (may be NULL).
+Returns the matching node, or NULL when the chain holds none.
+*/
+static Node *FindMatchingDescendant( Node *parent, Node *node, Node *marker_node, Bool *is_parent_of_marker )
+{
+    struct MatchingDescendantData cb_data = { 0 };
+    cb_data.matching_tagId = TagId(node);
+    cb_data.node_to_find = node;
+    cb_data.marker_node = marker_node;
+    cb_data.passed_marker_node = is_parent_of_marker; /* lets the callback raise the caller's flag */
+
+    assert(node);
+    if (is_parent_of_marker)
+        *is_parent_of_marker = no;
+
+    TY_(TraverseNodeTree)(NULL, parent, FindDescendant_cb, &cb_data);
+    return cb_data.found_node;
+}
+
+/*
+ Act as a generic XML (sub)tree parser: collect each node and add it to the DOM, without any further validation.
+ TODO : add schema- or other-hierarchy-definition-based validation of the subtree here...
+
+ 'basenode' is the root of the subtree being filled. Parsing ends when its end tag is consumed,
+ when an end tag for an element outside 'basenode' shows up (pushed back for the caller), or at
+ end of input (reported as MISSING_ENDTAG_FOR). 'mode' is ignored: OtherNamespace is always used.
+*/
+void TY_(ParseNamespace)(TidyDocImpl* doc, Node *basenode, GetTokenMode mode)
+{
+    Lexer* lexer = doc->lexer;
+    Node *node;
+    Node *parent = basenode;
+    uint istackbase;
+
+    /* defer popping elements off the inline stack; the saved base is put back on every exit */
+    TY_(DeferDup)( doc );
+    istackbase = lexer->istackbase;
+    lexer->istackbase = lexer->istacksize;
+
+    mode = OtherNamespace; /* the caller's mode is deliberately overridden */
+
+    while ((node = TY_(GetToken)(doc, mode)) != NULL)
+    {
+        /* every token is dealt with right here; nothing is handed on to InsertMisc() */
+
+        /* is this a close tag? And does it match the current parent node? */
+        if (node->type == EndTag)
+        {
+            /*
+              to prevent end tags flowing from one 'alternate namespace' we
+              check this in two phases: first we check if the tag closes the
+              current node or one of its ancestors, and when it does, we check
+              whether that node lies /within/ or /outside/ the basenode.
+            */
+            Bool outside;
+            Node *mp = FindMatchingDescendant(parent, node, basenode, &outside);
+
+            if (mp != NULL)
+            {
+                /*
+                  when mp != parent as we might expect,
+                  infer end tags until we 'hit' the matched
+                  parent or the basenode
+                */
+                Node *n;
+
+                for (n = parent;
+                     n != NULL && n != basenode->parent && n != mp;
+                     n = n->parent)
+                {
+                    n->closed = yes;
+                    TY_(ReportError)(doc, n->parent, n, MISSING_ENDTAG_BEFORE);
+                }
+                assert(outside == no ? n == mp : 1);
+                assert(outside == yes ? n == basenode->parent : 1);
+
+                if (outside == no)
+                {
+                    /* EndTag for a node within the basenode subtree. Roll on... */
+                    n->closed = yes;
+                    TY_(FreeNode)(doc, node);
+
+                    node = n;
+                    parent = node->parent;
+                }
+                else
+                {
+                    /* EndTag for a node outside the basenode subtree: let the caller handle that. */
+                    TY_(UngetToken)( doc );
+                    node = basenode;
+                    parent = node->parent;
+                }
+
+                /* when we've arrived at the end-node for the base node, it's quitting time */
+                if (node == basenode)
+                {
+                    lexer->istackbase = istackbase;
+                    assert(basenode->closed == yes);
+                    return;
+                }
+            }
+            else
+            {
+                /* unmatched close tag: report an error and discard */
+                TY_(ReportError)(doc, parent, node, NON_MATCHING_ENDTAG);
+                TY_(ReportError)(doc, parent, node, DISCARDING_UNEXPECTED);
+                assert(parent);
+                assert(parent->tag != node->tag);
+                /* the token was never linked into the tree, so it has to be released here */
+                TY_(FreeNode)(doc, node);
+            }
+        }
+        else if (node->type == StartTag)
+        {
+            /* add another child to the current parent */
+            TY_(InsertNodeAtEnd)(parent, node);
+            parent = node;
+        }
+        else
+        {
+            TY_(InsertNodeAtEnd)(parent, node);
+        }
+    }
+
+    /* ran out of input: put the inline stack base back, as the normal exit above does */
+    lexer->istackbase = istackbase;
+    TY_(ReportError)(doc, basenode->parent, basenode, MISSING_ENDTAG_FOR);
+}
+
 void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
 {
 #if !defined(NDEBUG) && defined(_MSC_VER)
diff --git a/src/tags.c b/src/tags.c
index 7a39cba..e6b8e0d 100644
--- a/src/tags.c
+++ b/src/tags.c
@@ -108,6 +108,7 @@ static CheckAttribs CheckHTML;
 #define VERS_ELEM_STYLE (xxxx|HT32|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50)
 #define VERS_ELEM_SUB (xxxx|HT32|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50)
 #define VERS_ELEM_SUP (xxxx|HT32|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50)
+#define VERS_ELEM_SVG (xxxx|xxxx|xxxx|H41T|X10T|xxxx|H41F|X10F|xxxx|H41S|X10S|XH11|xxxx|HT50|XH50)
 #define VERS_ELEM_TABLE (xxxx|HT32|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|XB10|HT50|XH50)
 #define VERS_ELEM_TBODY (xxxx|xxxx|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|xxxx|HT50|XH50)
 #define VERS_ELEM_TD (xxxx|HT32|H40T|H41T|X10T|H40F|H41F|X10F|H40S|H41S|X10S|XH11|XB10|HT50|XH50)
@@ -247,6 +248,7 @@ static const Dict tag_defs[] =
   { TidyTag_STYLE, "style", VERS_ELEM_STYLE, &TY_(W3CAttrsFor_STYLE)[0], (CM_HEAD|CM_BLOCK), TY_(ParseScript), NULL },
   { TidyTag_SUB, "sub", VERS_ELEM_SUB,
&TY_(W3CAttrsFor_SUB)[0], (CM_INLINE), TY_(ParseInline), NULL },
   { TidyTag_SUP, "sup", VERS_ELEM_SUP, &TY_(W3CAttrsFor_SUP)[0], (CM_INLINE), TY_(ParseInline), NULL },
+  { TidyTag_SVG, "svg", VERS_ELEM_SVG, &TY_(W3CAttrsFor_SVG)[0], (CM_INLINE|CM_BLOCK|CM_MIXED), TY_(ParseNamespace),NULL },
   { TidyTag_TABLE, "table", VERS_ELEM_TABLE, &TY_(W3CAttrsFor_TABLE)[0], (CM_BLOCK), TY_(ParseTableTag), CheckTABLE },
   { TidyTag_TBODY, "tbody", VERS_ELEM_TBODY, &TY_(W3CAttrsFor_TBODY)[0], (CM_TABLE|CM_ROWGRP|CM_OPT), TY_(ParseRowGroup), NULL },
   { TidyTag_TD, "td", VERS_ELEM_TD, &TY_(W3CAttrsFor_TD)[0], (CM_ROW|CM_OPT|CM_NO_INDENT), TY_(ParseBlock), NULL },
@@ -949,6 +951,41 @@ uint TY_(nodeHeaderLevel)( Node* node )
     return 0;
 }
 
+/* [i_a] generic node tree traversal (signals: see NodeTraversalSignal in tidy-int.h).
+   Calls 'cb' on 'node', handing 'propagate' through, then moves to a child, sibling or
+   parent as directed by the signal the callback returns.
+   Returns ExitTraversal as soon as a callback asks for it, ContinueTraversal otherwise. */
+NodeTraversalSignal TY_(TraverseNodeTree)(TidyDocImpl* doc, Node* node, NodeTraversalCallBack *cb, void *propagate )
+{
+    while (node)
+    {
+        NodeTraversalSignal s = (*cb)(doc, node, propagate);
+
+        if (node->content && (s == ContinueTraversal || s == SkipSiblings))
+        {
+            /* keep 's' intact so that a SkipSiblings request survives the descent */
+            if (TY_(TraverseNodeTree)(doc, node->content, cb, propagate) == ExitTraversal)
+                return ExitTraversal;
+        }
+
+        switch (s)
+        {
+        case ExitTraversal:
+            return ExitTraversal;
+        case VisitParent:
+            node = node->parent;
+            continue;
+        case SkipSiblings:
+        case SkipChildrenAndSiblings:
+            return ContinueTraversal;
+        default:
+            node = node->next;
+            break;
+        }
+    }
+    return ContinueTraversal;
+}
+
 /*
  * local variables:
  * mode: c
diff --git a/src/tags.h b/src/tags.h
index 67f0174..8dc19a0 100644
--- a/src/tags.h
+++ b/src/tags.h
@@ -111,6 +111,7 @@ Parser TY_(ParseSelect);
 Parser TY_(ParseOptGroup);
 Parser TY_(ParseText);
 Parser TY_(ParseDatalist);
+Parser TY_(ParseNamespace);
 
 CheckAttribs TY_(CheckAttributes);
 
diff --git a/src/tidy-int.h b/src/tidy-int.h
index 6bc2a6f..755a4bf 100755
--- a/src/tidy-int.h
+++ b/src/tidy-int.h
@@ -121,4 +121,26 @@ TidyOption tidyImplToOption( const TidyOptionImpl* option );
 
 int TY_(DocParseStream)( TidyDocImpl* impl, StreamIn* in );
 
+/*
+ [i_a] generic node tree traversal code; used in several spots.
+
+ Define your own callback, which returns one of the NodeTraversalSignal values
+ to instruct the tree traversal routine TraverseNodeTree() what to do.
+
+ Pass custom data to/from the callback using the 'propagate' reference.
+ */
+typedef enum
+{
+    ContinueTraversal,       /* visit siblings and children */
+    SkipChildren,            /* visit siblings of this node; ignore its children */
+    SkipSiblings,            /* visit the children of this node; ignore its subsequent siblings and their children */
+    SkipChildrenAndSiblings, /* ignore the children and the subsequent siblings of this node */
+    VisitParent,             /* REVERSE traversal: visit the parent of the current node */
+    ExitTraversal            /* terminate traversal on the spot */
+} NodeTraversalSignal;
+
+typedef NodeTraversalSignal NodeTraversalCallBack(TidyDocImpl* doc, Node* node, void *propagate);
+
+NodeTraversalSignal TY_(TraverseNodeTree)(TidyDocImpl* doc, Node* node, NodeTraversalCallBack *cb, void *propagate);
+
 #endif /* __TIDY_INT_H__ */