diff --git a/include/tidyenum.h b/include/tidyenum.h
index 37e5dc7..9aaf43b 100644
--- a/include/tidyenum.h
+++ b/include/tidyenum.h
@@ -648,6 +648,7 @@ typedef enum
   TidyXmlPIs,          /**< If set to yes PIs must end with ?> */
   TidyXmlSpace,        /**< If set to yes adds xml:space attr as needed */
   TidyXmlTags,         /**< Treat input as XML */
+  TidyMetaCharset,     /**< Adds/checks/fixes meta charset in the head, based on document type */
   N_TIDY_OPTIONS       /**< Must be last */
 } TidyOptionId;
 
diff --git a/src/attrs.h b/src/attrs.h
index e5b0fa9..0192efc 100644
--- a/src/attrs.h
+++ b/src/attrs.h
@@ -385,6 +385,7 @@ Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc);
 #define attrGetHEIGHT( nod )      TY_(AttrGetById)( nod, TidyAttr_HEIGHT )
 #define attrGetFOR( nod )         TY_(AttrGetById)( nod, TidyAttr_FOR )
 #define attrGetSELECTED( nod )    TY_(AttrGetById)( nod, TidyAttr_SELECTED )
+#define attrGetCHARSET( nod )     TY_(AttrGetById)( nod, TidyAttr_CHARSET )
 #define attrGetCHECKED( nod )     TY_(AttrGetById)( nod, TidyAttr_CHECKED )
 #define attrGetLANG( nod )        TY_(AttrGetById)( nod, TidyAttr_LANG )
 #define attrGetTARGET( nod )      TY_(AttrGetById)( nod, TidyAttr_TARGET )
diff --git a/src/config.c b/src/config.c
index f1b62d0..49208e5 100644
--- a/src/config.c
+++ b/src/config.c
@@ -336,6 +336,7 @@ static const TidyOptionImpl option_defs[] =
   { TidyXmlPIs,        MU, "assume-xml-procins", BL, no, ParseBool, boolPicks },
   { TidyXmlSpace,      MU, "add-xml-space",      BL, no, ParseBool, boolPicks },
   { TidyXmlTags,       MU, "input-xml",          BL, no, ParseBool, boolPicks },
+  { TidyMetaCharset,   MS, "add-meta-charset",   BL, no, ParseBool, boolPicks }, /* 20161004 - Issue #456 */
   { N_TIDY_OPTIONS,    XX, NULL,                 XY, 0,  NULL,      NULL      }
 };
 
diff --git a/src/language_en.h b/src/language_en.h
index 3deed58..c0fa41d 100644
--- a/src/language_en.h
+++ b/src/language_en.h
@@ -1508,7 +1508,18 @@ static languageDefinition language_en = { whichPluralForm_en, {
         "This option specifies if Tidy should use the XML parser rather than the "
         "error correcting HTML parser. "
     },
-
+    {/* Important notes for translators:
+        - Use only <code></code>, <var></var>, <em></em>, <strong></strong>, and
+          <br/>.
+        - Entities, tags, attributes, etc., should be enclosed in <code></code>.
+        - Option values should be enclosed in <var></var>.
+        - It's very important that <br/> be self-closing!
+        - The strings "Tidy" and "HTML Tidy" are the program name and must not
+          be translated. */
+      TidyMetaCharset,              0,
+        "This option adds a meta element and sets the charset attribute to the encoding of the document. "
+        "Set this option to 'yes' if you want this."
+    },
 
     /********************************************
      ** TidyConfigCategory enumeration
diff --git a/src/lexer.c b/src/lexer.c
index 238fbfa..c2773dc 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -1827,6 +1827,175 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
     return node;
 }
 
+/* Returns yes if a lower-cased copy of value equals expected or, when
+** substring is yes, contains it. The caller passes expected in lower case.
+** Working on a copy keeps the attribute value stored in the document
+** untouched. A NULL value never matches.
+*/
+static Bool LowerCasedMatch( TidyDocImpl* doc, ctmbstr value,
+                             ctmbstr expected, Bool substring )
+{
+    Bool matched;
+    tmbstr lowered;
+
+    if ( !value )
+        return no;
+
+    lowered = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(value) + 1 );
+    if ( !lowered )
+        return no;
+    TY_(tmbstrcpy)( lowered, value );
+    TY_(tmbstrtolower)( lowered );
+
+    if ( substring )
+        matched = TY_(tmbsubstr)( lowered, expected ) ? yes : no;
+    else
+        matched = TY_(tmbstrcmp)( lowered, expected ) == 0 ? yes : no;
+
+    TidyDocFree( doc, lowered );
+    return matched;
+}
+
+/* Replaces the value of attr with a copy of newValue and reports the
+** replacement against node. The report is issued while attr still holds
+** the old value; the old value is released afterwards.
+*/
+static void ReplaceAttrValue( TidyDocImpl* doc, Node* node, AttVal* attr,
+                              ctmbstr newValue )
+{
+    tmbstr copy = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(newValue) + 1 );
+
+    if ( !copy )
+        return;
+    TY_(tmbstrcpy)( copy, newValue );
+    TY_(ReportAttrError)( doc, node, attr, BAD_ATTRIBUTE_VALUE_REPLACED );
+    if ( attr->value )
+        TidyDocFree( doc, attr->value );
+    attr->value = copy;
+}
+
+/* Check meta charset
+   1. if HEAD has no charset declaration, add one that suits the doctype.
+   2. if there is one, move it to the top of HEAD.
+   3. if it doesn't match the output encoding, report that and fix it.
+   4. if there are duplicates, discard them.
+   Returns no when there is no HEAD, the output encoding has no name or
+   memory runs out; otherwise yes.
+ */
+Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
+{
+    static const char mediaType[] = "text/html; ";
+    static const char charsetKey[] = "charset=";
+    const uint keyOffset = (uint) sizeof(mediaType) - 1;
+    const uint encOffset = keyOffset + (uint) sizeof(charsetKey) - 1;
+    AttVal *charsetAttr;
+    AttVal *contentAttr;
+    AttVal *declAttr;
+    AttVal *httpEquivAttr;
+    Bool charsetFound = no;
+    Bool matches;
+    ctmbstr charsetParam;
+    ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
+    ctmbstr expected;
+    Node *currentNode;
+    Node *head = TY_(FindHEAD)( doc );
+    Node *metaTag;
+    Node *nextNode;
+    tmbstr pragmaContent;
+
+    /* We can't do anything if we don't have a head or encoding is NULL */
+    if ( !head || !enc )
+        return no;
+
+    /* Build "text/html; charset=<enc>" once. The whole string is the content
+       value for a Content-Type pragma; its "charset=<enc>" tail is what an
+       existing pragma has to contain to agree with the output encoding. */
+    pragmaContent = (tmbstr) TidyDocAlloc( doc, encOffset + TY_(tmbstrlen)(enc) + 1 );
+    if ( !pragmaContent )
+        return no;
+    TY_(tmbstrcpy)( pragmaContent, mediaType );
+    TY_(tmbstrcpy)( pragmaContent + keyOffset, charsetKey );
+    TY_(tmbstrcpy)( pragmaContent + encOffset, enc );
+    charsetParam = pragmaContent + keyOffset;
+
+    for ( currentNode = head->content; currentNode; currentNode = nextNode )
+    {
+        /* currentNode may be moved or discarded below, so fetch its
+           successor before touching it. */
+        nextNode = currentNode->next;
+
+        if ( !nodeIsMETA(currentNode) )
+            continue;
+
+        charsetAttr = attrGetCHARSET(currentNode);
+        httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
+
+        if ( charsetAttr )
+        {
+            /* <meta charset="..."> - expected for (X)HTML5. A meta that
+               carries http-equiv as well is also handled through its
+               charset attribute. */
+            declAttr = charsetAttr;
+            expected = enc;
+            matches = LowerCasedMatch( doc, charsetAttr->value, enc, no );
+        }
+        else
+        {
+            /* <meta http-equiv="Content-Type" content="...; charset=...">
+               - expected for HTML4. Other http-equiv metas are left alone. */
+            contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
+            if ( !httpEquivAttr || !contentAttr ||
+                 !LowerCasedMatch(doc, httpEquivAttr->value, "content-type", no) )
+                continue;
+            declAttr = contentAttr;
+            expected = pragmaContent;
+            matches = LowerCasedMatch( doc, contentAttr->value, charsetParam, yes );
+        }
+
+        /* We already found a declaration, so remove the rest. */
+        if ( charsetFound )
+        {
+            TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+            TY_(DiscardElement)( doc, currentNode );
+            continue;
+        }
+        charsetFound = yes;
+
+        /* Fix mismatched attribute value */
+        if ( !matches )
+            ReplaceAttrValue( doc, currentNode, declAttr, expected );
+
+        /* Make sure it's the first element. */
+        if ( currentNode != head->content )
+        {
+            TY_(RemoveNode)( currentNode );
+            TY_(InsertNodeAtStart)( head, currentNode );
+        }
+    }
+
+    if ( !charsetFound )
+    {
+        metaTag = TY_(InferredTag)(doc, TidyTag_META);
+        switch ( TY_(HTMLVersion)(doc) )
+        {
+        case HT50:
+        case XH50:
+            TY_(AddAttribute)( doc, metaTag, "charset", enc );
+            break;
+        default:
+            /* content only declares a charset when paired with
+               http-equiv="Content-Type" */
+            TY_(AddAttribute)( doc, metaTag, "http-equiv", "Content-Type" );
+            TY_(AddAttribute)( doc, metaTag, "content", pragmaContent );
+            break;
+        }
+        TY_(InsertNodeAtStart)( head, metaTag );
+    }
+
+    TidyDocFree( doc, pragmaContent );
+    return yes;
+}
+
 /* add meta element for Tidy */
 Bool TY_(AddGenerator)( TidyDocImpl* doc )
 {
diff --git a/src/lexer.h b/src/lexer.h
index 1d3d9cd..a30e5d8 100644
--- a/src/lexer.h
+++ b/src/lexer.h
@@ -491,6 +491,9 @@ Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
 /* Returns containing block element, if any */
 Node* TY_(FindContainer)( Node* node );
 
+/* Adds, fixes or de-duplicates the charset declaration in HEAD */
+Bool TY_(TidyMetaCharset)( TidyDocImpl* doc );
+
 /* add meta element for Tidy */
 Bool TY_(AddGenerator)( TidyDocImpl* doc );
 
diff --git a/src/tidylib.c b/src/tidylib.c
index fad55e6..811721b 100755
--- a/src/tidylib.c
+++ b/src/tidylib.c
@@ -1992,6 +1992,7 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
     Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut );
     Bool xmlDecl  = cfgBool( doc, TidyXmlDecl );
     Bool tidyMark = cfgBool( doc, TidyMark );
+    Bool tidyMetaCharset = cfgBool( doc, TidyMetaCharset);
     Bool tidyXmlTags = cfgBool( doc, TidyXmlTags );
     Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName );
     Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis );
@@ -2095,6 +2096,9 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
 
         if (tidyMark )
             TY_(AddGenerator)(doc);
+
+        if (tidyMetaCharset)
+            TY_(TidyMetaCharset)(doc);
     }
 
     /* ensure presence of initial */