From 2ade3357a927f5139bca62c6c62d1514f467dfad Mon Sep 17 00:00:00 2001 From: Jim Derry Date: Sat, 13 Feb 2016 11:31:16 +0800 Subject: [PATCH] Phase 2 This is a MUCH SANER approach to what I was trying to do (now that I screwed up enough internals to understand some of them! At this point there are zero exit state reversions, and zero markup reversions! There are still 21 errout reversions; I'll annotate and adjust as necessary. --- src/attrs.c | 47 +++++-- src/attrs.h | 4 + src/language.c | 6 +- src/language_en.h | 25 +++- src/lexer.c | 20 +-- src/message.c | 42 ++++++- src/message.h | 6 +- src/tidylib.c | 314 +++++++++++++++++++++++++++++++--------------- version.txt | 2 +- 9 files changed, 324 insertions(+), 142 deletions(-) diff --git a/src/attrs.c b/src/attrs.c index 983caee..f678245 100644 --- a/src/attrs.c +++ b/src/attrs.c @@ -479,11 +479,12 @@ uint TY_(NodeAttributeVersions)( Node* node, TidyAttrId id ) return VERS_UNKNOWN; } -/* returns true if the element is a W3C defined element */ -/* but the element/attribute combination is not. We're */ -/* only defining as "proprietary" items that are not in */ -/* the element's AttrVersion structure. */ -static Bool AttributeIsProprietary(Node* node, AttVal* attval) +/* returns true if the element is a W3C defined element + * but the element/attribute combination is not. We're + * only defining as "proprietary" items that are not in + * the element's AttrVersion structure. + */ +Bool TY_(AttributeIsProprietary)(Node* node, AttVal* attval) { if (!node || !attval) return no; @@ -500,6 +501,34 @@ static Bool AttributeIsProprietary(Node* node, AttVal* attval) return yes; } +/* returns true if the element is a W3C defined element + * but the element/attribute combination is not. We're + * considering it a mismatch if the document version + * does not allow the attribute as called out in its + * AttrVersion structure. + */ +Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc) +{ + uint doctype; + + if (!node || !attval) + return no; + + if (!node->tag) + return no; + + if (!(node->tag->versions & VERS_ALL)) + return no; + + doctype = doc->lexer->versionEmitted == 0 ? doc->lexer->doctype : doc->lexer->versionEmitted; + + if (AttributeVersions(node, attval) & doctype) + return no; + + return yes; +} + + /* used by CheckColor() */ struct _colors { @@ -1358,14 +1387,6 @@ const Attribute* TY_(CheckAttribute)( TidyDocImpl* doc, Node *node, AttVal *attv attribute->attrchk( doc, node, attval ); } - if (AttributeIsProprietary(node, attval)) - { - TY_(ReportAttrError)(doc, node, attval, PROPRIETARY_ATTRIBUTE); - - if (cfgBool(doc, TidyDropPropAttrs)) - TY_(RemoveAttribute)( doc, node, attval ); - } - return attribute; } diff --git a/src/attrs.h b/src/attrs.h index 2a70faf..e5b0fa9 100644 --- a/src/attrs.h +++ b/src/attrs.h @@ -147,6 +147,10 @@ AttVal* TY_(AttrGetById)( Node* node, TidyAttrId id ); uint TY_(NodeAttributeVersions)( Node* node, TidyAttrId id ); +Bool TY_(AttributeIsProprietary)(Node* node, AttVal* attval); +Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc); + + /* 0 == TidyAttr_UNKNOWN */ #define AttrId(av) ((av) && (av)->dict ? (av)->dict->id : TidyAttr_UNKNOWN) #define AttrIsId(av, atid) ((av) && (av)->dict && ((av)->dict->id == atid)) diff --git a/src/language.c b/src/language.c index 6d60389..1959409 100644 --- a/src/language.c +++ b/src/language.c @@ -262,6 +262,8 @@ static const tidyErrorFilterKeyItem tidyErrorFilterKeysStruct[] = { { "CANT_BE_NESTED", CANT_BE_NESTED }, { "OBSOLETE_ELEMENT", OBSOLETE_ELEMENT }, { "PROPRIETARY_ELEMENT", PROPRIETARY_ELEMENT }, + { "ELEMENT_VERS_MISMATCH_ERROR", ELEMENT_VERS_MISMATCH_ERROR }, + { "ELEMENT_VERS_MISMATCH_WARN", ELEMENT_VERS_MISMATCH_WARN }, { "UNKNOWN_ELEMENT", UNKNOWN_ELEMENT }, { "TRIM_EMPTY_ELEMENT", TRIM_EMPTY_ELEMENT }, { "COERCE_TO_ENDTAG", COERCE_TO_ENDTAG }, @@ -298,6 +300,8 @@ static const tidyErrorFilterKeyItem tidyErrorFilterKeysStruct[] = { { "BAD_ATTRIBUTE_VALUE", BAD_ATTRIBUTE_VALUE }, { "UNEXPECTED_GT", UNEXPECTED_GT }, { "PROPRIETARY_ATTRIBUTE", PROPRIETARY_ATTRIBUTE }, + { "MISMATCHED_ATTRIBUTE_ERROR", MISMATCHED_ATTRIBUTE_ERROR }, + { "MISMATCHED_ATTRIBUTE_WARN", MISMATCHED_ATTRIBUTE_WARN }, { "PROPRIETARY_ATTR_VALUE", PROPRIETARY_ATTR_VALUE }, { "REPEATED_ATTRIBUTE", REPEATED_ATTRIBUTE }, { "MISSING_IMAGEMAP", MISSING_IMAGEMAP }, @@ -322,8 +326,6 @@ static const tidyErrorFilterKeyItem tidyErrorFilterKeysStruct[] = { { "MISSING_ATTRIBUTE", MISSING_ATTRIBUTE }, { "WHITE_IN_URI", WHITE_IN_URI }, { "REMOVED_HTML5", REMOVED_HTML5 }, - { "BAD_BODY_HTML5", BAD_BODY_HTML5 }, - { "BAD_ALIGN_HTML5", BAD_ALIGN_HTML5 }, { "BAD_SUMMARY_HTML5", BAD_SUMMARY_HTML5 }, { "PREVIOUS_LOCATION", PREVIOUS_LOCATION }, { "VENDOR_SPECIFIC_CHARS", VENDOR_SPECIFIC_CHARS }, diff --git a/src/language_en.h b/src/language_en.h index fbfb1a8..a06d58c 100644 --- a/src/language_en.h +++ b/src/language_en.h @@ -350,6 +350,8 @@ static languageDefinition language_en = { whichPluralForm_en, { { MISSING_ATTR_VALUE, 0, "%s attribute \"%s\" lacks value" }, /* Warning in CheckUrl, Error otherwise */ { UNKNOWN_ATTRIBUTE, 0, "%s unknown attribute \"%s\"" }, /* Error */ { PROPRIETARY_ATTRIBUTE, 0, "%s proprietary attribute \"%s\"" }, /* Error */ + { MISMATCHED_ATTRIBUTE_ERROR, 0, "%s attribute \"%s\" not allowed for %s" }, /* Error */ + { MISMATCHED_ATTRIBUTE_WARN, 0, "%s attribute \"%s\" not allowed for %s" }, /* Warning */ { JOINING_ATTRIBUTE, 0, "%s joining values of repeated attribute \"%s\"" }, /* Error */ { XML_ATTRIBUTE_VALUE, 0, "%s has XML attribute \"%s\"" }, /* Error (but deprecated) */ @@ -392,8 +394,6 @@ static languageDefinition language_en = { whichPluralForm_en, { { OBSOLETE_ELEMENT, 0, "replacing obsolete element %s with %s" }, /* Warning */ { COERCE_TO_ENDTAG_WARN, 0, "<%s> is probably intended as " }, /* Warning */ { REMOVED_HTML5, 0, "%s element removed from HTML5" }, /* Warning */ - { BAD_BODY_HTML5, 0, "Found attribute on body that is obsolete in HTML5. Use CSS" }, /* Warning */ - { BAD_ALIGN_HTML5, 0, "The align attribute on the %s element is obsolete. Use CSS" }, /* Warning */ { BAD_SUMMARY_HTML5, 0, "The summary attribute on the %s element is obsolete in HTML5" }, /* Warning */ /* ReportNotice */ @@ -415,6 +415,8 @@ static languageDefinition language_en = { whichPluralForm_en, { { INSERTING_TAG, 0, "inserting implicit <%s>" }, /* Error */ { CANT_BE_NESTED, 0, "%s can't be nested" }, /* Error */ { PROPRIETARY_ELEMENT, 0, "%s is not approved by W3C" }, /* Error */ + { ELEMENT_VERS_MISMATCH_ERROR, 0, "%s element not available in %s" }, /* Error */ + { ELEMENT_VERS_MISMATCH_WARN, 0, "%s element not available in %s" }, /* Warning */ { ILLEGAL_NESTING, 0, "%s shouldn't be nested" }, /* Error */ { NOFRAMES_CONTENT, 0, "%s not inside 'noframes' element" }, /* Error */ { UNEXPECTED_END_OF_FILE, 0, "unexpected end of file %s" }, /* Error */ @@ -745,7 +747,9 @@ static languageDefinition language_en = { whichPluralForm_en, { - The strings "Tidy" and "HTML Tidy" are the program name and must not be translated. */ TidyDropPropAttrs, 0, "This option specifies if Tidy should strip out proprietary attributes, " - "such as Microsoft data binding attributes. " + "such as Microsoft data binding attributes. Additionally attributes " + "that aren't permitted in the output version of HTML will be dropped " + "if used with strict-tags-attributes. " }, {/* Please use _only_ , , , and
. It's very important that
be self-closing in this manner! @@ -1574,6 +1578,21 @@ static languageDefinition language_en = { whichPluralForm_en, { "This option specifies that Tidy should skip nested tags when parsing " "script and style data. " }, + {/* Please use _only_ , , , and
. + It's very important that
be self-closing in this manner! + - The strings "Tidy" and "HTML Tidy" are the program name and must not be translated. */ + TidyStrictTagsAttr, 0, + "This options ensures that tags and attributes are applicable for the " + "version of HTML that Tidy outputs. When set to yes (the " + "default) and the output document type is a strict doctype, then Tidy " + "will report errors. If the output document type is a loose or " + "transitional doctype, then Tidy will report warnings. " + "
" + "Additionally if drop-proprietary-attributes is enabled, " + "then not applicable attributes will be dropped, too. " + "
" + "When set to no, these checks are not performed. " + }, /******************************************************** ** Console Application diff --git a/src/lexer.c b/src/lexer.c index 24b2343..00193a8 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -2709,26 +2709,8 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode ) { Node* curr = lexer->token; TY_(ConstrainVersion)( doc, curr->tag->versions ); - - if ( curr->tag->versions & VERS_PROPRIETARY ) - { - if ( !cfgBool(doc, TidyMakeClean) || - ( !nodeIsNOBR(curr) && !nodeIsWBR(curr) ) ) - { - TY_(ReportError)(doc, NULL, curr, PROPRIETARY_ELEMENT ); - - if ( nodeIsLAYER(curr) ) - doc->badLayout |= USING_LAYER; - else if ( nodeIsSPACER(curr) ) - doc->badLayout |= USING_SPACER; - else if ( nodeIsNOBR(curr) ) - doc->badLayout |= USING_NOBR; - } - } - - TY_(RepairDuplicateAttributes)( doc, curr, no ); - } else TY_(RepairDuplicateAttributes)( doc, lexer->token, yes ); + #ifdef TIDY_STORE_ORIGINAL_TEXT StoreOriginalTextInToken(doc, lexer->token, 0); #endif diff --git a/src/message.c b/src/message.c index be82246..2328515 100755 --- a/src/message.c +++ b/src/message.c @@ -525,6 +525,8 @@ void TY_(ReportAttrError)(TidyDocImpl* doc, Node *node, AttVal *av, uint code) char const *name = "NULL", *value = "NULL"; char tagdesc[64]; ctmbstr fmt = tidyLocalizedString(code); + uint version; + ctmbstr extra_string; assert( fmt != NULL ); @@ -549,6 +551,22 @@ void TY_(ReportAttrError)(TidyDocImpl* doc, Node *node, AttVal *av, uint code) messageNode(doc, TidyWarning, code, node, fmt, tagdesc, name); break; + case MISMATCHED_ATTRIBUTE_WARN: + version = doc->lexer->versionEmitted == 0 ? doc->lexer->doctype : doc->lexer->versionEmitted; + extra_string = TY_(HTMLVersionNameFromCode)(version, 0); + if (!extra_string) + extra_string = tidyLocalizedString(STRING_HTML_PROPRIETARY); + messageNode(doc, TidyWarning, code, node, fmt, tagdesc, name, extra_string); + break; + + case MISMATCHED_ATTRIBUTE_ERROR: + version = doc->lexer->versionEmitted == 0 ? doc->lexer->doctype : doc->lexer->versionEmitted; + extra_string = TY_(HTMLVersionNameFromCode)(version, 0); + if (!extra_string) + extra_string = tidyLocalizedString(STRING_HTML_PROPRIETARY); + messageNode(doc, TidyError, code, node, fmt, tagdesc, name, extra_string); + break; + case BAD_ATTRIBUTE_VALUE: case BAD_ATTRIBUTE_VALUE_REPLACED: case INVALID_ATTRIBUTE: @@ -665,8 +683,6 @@ void TY_(ReportWarning)(TidyDocImpl* doc, Node *element, Node *node, uint code) case NESTED_EMPHASIS: case REMOVED_HTML5: - case BAD_BODY_HTML5: - case BAD_ALIGN_HTML5: case BAD_SUMMARY_HTML5: messageNode(doc, TidyWarning, code, rpt, fmt, nodedesc); break; @@ -707,6 +723,8 @@ void TY_(ReportError)(TidyDocImpl* doc, Node *element, Node *node, uint code) char elemdesc[ 256 ] = {0}; Node* rpt = ( element ? element : node ); ctmbstr fmt = tidyLocalizedString(code); + uint versionEmitted, declared, version; + ctmbstr extra_string = NULL; assert( fmt != NULL ); @@ -729,6 +747,26 @@ void TY_(ReportError)(TidyDocImpl* doc, Node *element, Node *node, uint code) messageNode(doc, TidyWarning, code, node, fmt, nodedesc); break; + case ELEMENT_VERS_MISMATCH_WARN: + versionEmitted = doc->lexer->versionEmitted; + declared = doc->lexer->doctype; + version = versionEmitted == 0 ? declared : versionEmitted; + extra_string = TY_(HTMLVersionNameFromCode)(version, 0); + if (!extra_string) + extra_string = tidyLocalizedString(STRING_HTML_PROPRIETARY); + messageNode(doc, TidyWarning, code, node, fmt, nodedesc, extra_string); + break; + + case ELEMENT_VERS_MISMATCH_ERROR: + versionEmitted = doc->lexer->versionEmitted; + declared = doc->lexer->doctype; + version = versionEmitted == 0 ? declared : versionEmitted; + extra_string = TY_(HTMLVersionNameFromCode)(version, 0); + if (!extra_string) + extra_string = tidyLocalizedString(STRING_HTML_PROPRIETARY); + messageNode(doc, TidyError, code, node, fmt, nodedesc, extra_string); + break; + case MISSING_TITLE_ELEMENT: case INCONSISTENT_VERSION: case MALFORMED_DOCTYPE: diff --git a/src/message.h b/src/message.h index a519766..7c49499 100644 --- a/src/message.h +++ b/src/message.h @@ -98,6 +98,8 @@ typedef enum { CANT_BE_NESTED, OBSOLETE_ELEMENT, PROPRIETARY_ELEMENT, + ELEMENT_VERS_MISMATCH_ERROR, + ELEMENT_VERS_MISMATCH_WARN, UNKNOWN_ELEMENT, TRIM_EMPTY_ELEMENT, COERCE_TO_ENDTAG, @@ -137,6 +139,8 @@ typedef enum { BAD_ATTRIBUTE_VALUE, UNEXPECTED_GT, PROPRIETARY_ATTRIBUTE, + MISMATCHED_ATTRIBUTE_ERROR, + MISMATCHED_ATTRIBUTE_WARN, PROPRIETARY_ATTR_VALUE, REPEATED_ATTRIBUTE, MISSING_IMAGEMAP, @@ -168,8 +172,6 @@ typedef enum { WHITE_IN_URI, REMOVED_HTML5, /* this element removed from HTML5 */ - BAD_BODY_HTML5, /* attr on body removed from HTML5 */ - BAD_ALIGN_HTML5, /* use of align attr removed from HTML5 */ BAD_SUMMARY_HTML5, /* use of summary attr removed from HTML5 */ PREVIOUS_LOCATION, /* last */ diff --git a/src/tidylib.c b/src/tidylib.c index 7e1e4bf..0dbdf05 100755 --- a/src/tidylib.c +++ b/src/tidylib.c @@ -1300,9 +1300,9 @@ void tidyDocReportDoctype( TidyDocImpl* doc ) } -/* ###################################################################################### - HTML5 STUFF - */ +/***************************************************************************** + * HTML5 STUFF + *****************************************************************************/ #if !defined(NDEBUG) && defined(_MSC_VER) extern void show_not_html5(void); /* ----------------------------- @@ -1358,19 +1358,19 @@ Bool inRemovedInfo( uint tid ) return no; } -static Bool BadBody5( Node* node ) -{ - if (TY_(AttrGetById)(node, TidyAttr_BACKGROUND) || - TY_(AttrGetById)(node, TidyAttr_BGCOLOR) || - TY_(AttrGetById)(node, TidyAttr_TEXT) || - TY_(AttrGetById)(node, TidyAttr_LINK) || - TY_(AttrGetById)(node, TidyAttr_VLINK) || - TY_(AttrGetById)(node, TidyAttr_ALINK)) - { - return yes; - } - return no; -} +/* Things that should not be in an HTML5 body. This is special for CheckHTML5(), + and we might just want to remove CheckHTML5()'s output altogether and count + on the default --strict-tags-attributes. + */ +static BadBody5Attribs[] = { + TidyAttr_BACKGROUND, + TidyAttr_BGCOLOR, + TidyAttr_TEXT, + TidyAttr_LINK, + TidyAttr_VLINK, + TidyAttr_ALINK, + TidyAttr_UNKNOWN /* Must be last! */ +}; static Bool nodeHasAlignAttr( Node *node ) { @@ -1383,73 +1383,94 @@ static Bool nodeHasAlignAttr( Node *node ) return no; } -/* see http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#obsolete */ - +/* + * Perform special checks for HTML, even when we're not using the default + * option `--strict-tags-attributes yes`. This will ensure that HTML5 warning + * and error output is given regardless of the new option, and ensure that + * cleanup takes place. This provides mostly consistent Tidy behavior even with + * the introduction of this new option. Note that strings have changed, though, + * in order to maintain consistency with the `--strict-tags-attributes` + * messages. + * + * See also: http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#obsolete + */ void TY_(CheckHTML5)( TidyDocImpl* doc, Node* node ) { - /* Lexer* lexer = doc->lexer; */ Bool clean = cfgBool( doc, TidyMakeClean ); + Bool already_strict = cfgBool( doc, TidyStrictTagsAttr ); Node* body = TY_(FindBody)( doc ); Bool warn = yes; /* should this be a warning, error, or report??? */ + AttVal* attr = NULL; + int i = 0; #if !defined(NDEBUG) && defined(_MSC_VER) -// list_not_html5(); + // list_not_html5(); #endif while (node) { if ( nodeHasAlignAttr( node ) ) { - /*\ - * Is this for ALL elements that accept an 'align' attribute, or should - * this be a sub-set test - \*/ - TY_(ReportWarning)(doc, node, node, BAD_ALIGN_HTML5); + /* @todo: Is this for ALL elements that accept an 'align' attribute, + * or should this be a sub-set test? + */ + + /* We will only emit this message if `--strict-tags-attributes==no`; + * otherwise if yes this message will be output during later + * checking. + */ + if ( !already_strict ) + TY_(ReportAttrError)(doc, node, TY_(AttrGetById)(node, TidyAttr_ALIGN), MISMATCHED_ATTRIBUTE_WARN); } if ( node == body ) { - if ( BadBody5(body) ) { - /* perhaps need a new/different warning for this, like - * The background 'attribute" on the body element is obsolete. Use CSS instead. - * but how to pass an attribute name to be embedded in the message. - \*/ - TY_(ReportWarning)(doc, node, body, BAD_BODY_HTML5); + i = 0; + /* We will only emit these messages if `--strict-tags-attributes==no`; + * otherwise if yes these messages will be output during later + * checking. + */ + if ( !already_strict ) { + while ( BadBody5Attribs[i] != TidyAttr_UNKNOWN ) { + attr = TY_(AttrGetById)(node, BadBody5Attribs[i]); + if ( attr ) + TY_(ReportAttrError)(doc, node, attr , MISMATCHED_ATTRIBUTE_WARN); + i++; + } } } else if ( nodeIsACRONYM(node) ) { if (clean) { - /* replace with 'abbr' with warning to that effect - * maybe should use static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid ) + /* Replace with 'abbr' with warning to that effect. + * Maybe should use static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid ) */ TY_(CoerceNode)(doc, node, TidyTag_ABBR, warn, no); } else { - /* sadly, this stops writing of the tidied document, unless 'forced' - TY_(ReportError)(doc, node, node, REMOVED_HTML5); - so go back to a 'warning' for now... - */ - TY_(ReportWarning)(doc, node, node, REMOVED_HTML5); + if ( !already_strict ) + TY_(ReportWarning)(doc, node, node, REMOVED_HTML5); } - } else + } else if ( nodeIsAPPLET(node) ) { if (clean) { - /* replace with 'object' with warning to that effect + /* replace with 'object' with warning to that effect * maybe should use static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid ) */ TY_(CoerceNode)(doc, node, TidyTag_OBJECT, warn, no); } else { - TY_(ReportWarning)(doc, node, node, REMOVED_HTML5); + if ( !already_strict ) + TY_(ReportWarning)(doc, node, node, REMOVED_HTML5); } } else if ( nodeIsBASEFONT(node) ) { - /*\ - * basefont: CSS equivalen 'font-size', 'font-family' and 'color' on body or class on each subsequent element - * Difficult - If it is the first body element, then could consider adding that - * to the as a whole, else could perhaps apply it to all subsequent element. - * But also in consideration is the fact that it was NOT supported in many browsers - * For now just report a warning - \*/ + /* basefont: CSS equivalent 'font-size', 'font-family' and 'color' + * on body or class on each subsequent element. + * Difficult - If it is the first body element, then could consider + * adding that to the as a whole, else could perhaps apply it + * to all subsequent elements. But also in consideration is the fact + * that it was NOT supported in many browsers. + * - For now just report a warning + */ + if ( !already_strict ) TY_(ReportWarning)(doc, node, node, REMOVED_HTML5); } else if ( nodeIsBIG(node) ) { - /*\ - * big: CSS equivalent 'font-size:larger' - * so could replace the ... with + /* big: CSS equivalent 'font-size:larger' + * so could replace the ... with * ... * then replace with * Need to think about that... @@ -1461,94 +1482,185 @@ void TY_(CheckHTML5)( TidyDocImpl* doc, Node* node ) * Also maybe need a specific message like * Element '%s' replaced with 'span' with a 'font-size: larger style attribute * maybe should use static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid ) - * - \*/ + */ if (clean) { TY_(AddStyleProperty)( doc, node, "font-size: larger" ); TY_(CoerceNode)(doc, node, TidyTag_SPAN, warn, no); } else { - TY_(ReportWarning)(doc, node, node, REMOVED_HTML5); + if ( !already_strict ) + TY_(ReportWarning)(doc, node, node, REMOVED_HTML5); } } else if ( nodeIsCENTER(node) ) { - /*\ - * center: CSS equivalent 'text-align:center' - * and 'margin-left:auto; margin-right:auto' on descendant blocks - * Tidy already handles this if 'clean' by SILENTLY generating the