diff --git a/include/tidyenum.h b/include/tidyenum.h
index 37e5dc7..9aaf43b 100644
--- a/include/tidyenum.h
+++ b/include/tidyenum.h
@@ -648,6 +648,7 @@ typedef enum
TidyXmlPIs, /**< If set to yes PIs must end with ?> */
TidyXmlSpace, /**< If set to yes adds xml:space attr as needed */
TidyXmlTags, /**< Treat input as XML */
+ TidyMetaCharset, /**< Adds/checks/fixes meta charset in the head, based on document type */
N_TIDY_OPTIONS /**< Must be last */
} TidyOptionId;
diff --git a/src/attrs.h b/src/attrs.h
index e5b0fa9..0192efc 100644
--- a/src/attrs.h
+++ b/src/attrs.h
@@ -385,6 +385,7 @@ Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc);
#define attrGetHEIGHT( nod ) TY_(AttrGetById)( nod, TidyAttr_HEIGHT )
#define attrGetFOR( nod ) TY_(AttrGetById)( nod, TidyAttr_FOR )
#define attrGetSELECTED( nod ) TY_(AttrGetById)( nod, TidyAttr_SELECTED )
+#define attrGetCHARSET( nod ) TY_(AttrGetById)( nod, TidyAttr_CHARSET )
#define attrGetCHECKED( nod ) TY_(AttrGetById)( nod, TidyAttr_CHECKED )
#define attrGetLANG( nod ) TY_(AttrGetById)( nod, TidyAttr_LANG )
#define attrGetTARGET( nod ) TY_(AttrGetById)( nod, TidyAttr_TARGET )
diff --git a/src/config.c b/src/config.c
index f1b62d0..49208e5 100644
--- a/src/config.c
+++ b/src/config.c
@@ -336,6 +336,7 @@ static const TidyOptionImpl option_defs[] =
{ TidyXmlPIs, MU, "assume-xml-procins", BL, no, ParseBool, boolPicks },
{ TidyXmlSpace, MU, "add-xml-space", BL, no, ParseBool, boolPicks },
{ TidyXmlTags, MU, "input-xml", BL, no, ParseBool, boolPicks },
+ { TidyMetaCharset, MS, "add-meta-charset", BL, no, ParseBool, boolPicks }, /* 20161004 - Issue #456 */
{ N_TIDY_OPTIONS, XX, NULL, XY, 0, NULL, NULL }
};
diff --git a/src/language_en.h b/src/language_en.h
index 3deed58..c0fa41d 100644
--- a/src/language_en.h
+++ b/src/language_en.h
@@ -1508,7 +1508,18 @@ static languageDefinition language_en = { whichPluralForm_en, {
"This option specifies if Tidy should use the XML parser rather than the "
"error correcting HTML parser. "
},
-
+    {/* Important notes for translators:
+        - Use only <code></code>, <var></var>, <em></em>, <strong></strong>, and
+          <br/>.
+        - Entities, tags, attributes, etc., should be enclosed in <code></code>.
+        - Option values should be enclosed in <var></var>.
+        - It's very important that <br/> be self-closing!
+        - The strings "Tidy" and "HTML Tidy" are the program name and must not
+          be translated. */
+      TidyMetaCharset,                        0,
+        "This option adds a <code>&lt;meta&gt;</code> element and sets the <code>charset</code> attribute to the encoding of the document. "
+        "Set this option to <var>yes</var> if you want this."
+    },
/********************************************
** TidyConfigCategory enumeration
@@ -1772,7 +1783,6 @@ static languageDefinition language_en = { whichPluralForm_en, {
"https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md"
},
-
/********************************************
** Report Output
** @remark enum source TidyStrings
@@ -1886,7 +1896,6 @@ static languageDefinition language_en = { whichPluralForm_en, {
{ ELEMENT_NOT_EMPTY, 0, "%s element not empty or not closed" }, /* ReportError, ReportAttrError */
{ UNEXPECTED_END_OF_FILE, 0, "unexpected end of file %s" }, /* ReportError, ReportAttrError */
{ UNEXPECTED_ENDTAG, 0, "unexpected %s>" }, /* ReportError, ReportFatal */
-
#if SUPPORT_ACCESSIBILITY_CHECKS
diff --git a/src/lexer.c b/src/lexer.c
index 238fbfa..c2773dc 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -1827,6 +1827,121 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
return node;
}
+/* Check meta charset
+   1. if there is no meta charset, it adds one.
+   2. if there is a meta charset, it moves it to the top of HEAD.
+   3. if it doesn't match the output encoding, warn about that.
+   4. if there are duplicates, discard them.
+   Returns no when there is no HEAD or no output encoding name, else yes.
+ */
+Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
+{
+    AttVal *charsetAttr;
+    AttVal *contentAttr;
+    AttVal *httpEquivAttr;
+    Bool charsetFound = no;
+    ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
+    Node *currentNode;
+    Node *head = TY_(FindHEAD)( doc );
+    Node *metaTag;
+    Node *nextNode;
+    TidyBuffer buf;
+    TidyBuffer charsetString;
+    tmbstr httpEquivAttrValue;
+    tmbstr lcontent;
+    tmbstr newValue;
+
+    /* We can't do anything if we don't have a head or encoding is NULL */
+    if( !head || !enc )
+        return no;
+
+    /* "charset=<enc>" is the same for every node, so build it once. The
+       explicit NUL lets charsetString.bp be passed as a C string. */
+    tidyBufInit(&charsetString);
+    tidyBufAppend(&charsetString, "charset=", 8);
+    tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)( enc ));
+    tidyBufAppend(&charsetString, "\0", 1);
+
+    /* Save the successor first: the body may discard or move currentNode. */
+    for (currentNode = head->content; currentNode; currentNode = nextNode)
+    {
+        nextNode = currentNode->next;
+        if (!nodeIsMETA(currentNode))
+            continue;
+        charsetAttr = attrGetCHARSET(currentNode);
+        httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
+        if(!charsetAttr && !httpEquivAttr)
+            continue;
+        /*
+         Meta charset comes in quite a few flavors:
+         1. <meta charset="UTF-8"> - expected for (X)HTML5.
+        */
+        if (charsetAttr && !httpEquivAttr)
+        {
+            /* we already found one, so remove the rest. */
+            if(charsetFound)
+            {
+                TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+                TY_(DiscardElement)( doc, currentNode );
+                continue;
+            }
+            charsetFound = yes;
+            /* Fix a missing or mismatched attribute value */
+            if( !charsetAttr->value ||
+                TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0 )
+            {
+                /* +1: room for the string terminator */
+                newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) + 1 );
+                TY_(tmbstrcpy)( newValue, enc );
+                TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
+                /* Release the old value only after it has been reported. */
+                if( charsetAttr->value )
+                    TidyDocFree( doc, charsetAttr->value );
+                charsetAttr->value = newValue;
+            }
+            /* Make sure it's the first element. */
+            if ( currentNode != head->content )
+            {
+                TY_(RemoveNode)( currentNode );
+                TY_(InsertNodeAtStart)( head, currentNode );
+            }
+            continue;
+        }
+        /*
+         2. <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+         expected for HTML4. This is normally ok - but can clash.
+        */
+        if(httpEquivAttr && !charsetAttr)
+        {
+            contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
+            /* A valueless http-equiv cannot name Content-Type. */
+            if(!httpEquivAttr->value)
+                continue;
+            httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
+            if(!contentAttr || !contentAttr->value ||
+               TY_(tmbstrcmp)(httpEquivAttrValue, (tmbstr) "content-type") != 0)
+                continue;
+            lcontent = TY_(tmbstrtolower)(contentAttr->value);
+            if(TY_(tmbsubstr)(lcontent, (ctmbstr) charsetString.bp))
+            {
+                /* Already declares the output encoding; don't add another. */
+                charsetFound = yes;
+            }
+            else
+            {
+                /* NOTE(review): placeholder diagnostic written to stdout;
+                   it should go through the report functions instead. */
+                printf("WARN ABOUT CLASH: %s \n", contentAttr->value);
+            }
+            continue;
+        }
+        /*
+         3. <meta charset="UTF-8" http-equiv="Content-Type" content="...">
+         This is generally bad.
+        */
+        if(httpEquivAttr && charsetAttr)
+        {
+            /* NOTE(review): placeholder diagnostic written to stdout. */
+            printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n");
+        }
+    }
+
+    /* No usable declaration was found: insert one at the start of HEAD. */
+    if(!charsetFound)
+    {
+        metaTag = TY_(InferredTag)(doc, TidyTag_META);
+        switch(TY_(HTMLVersion)(doc))
+        {
+        case HT50:
+        case XH50:
+            TY_(AddAttribute)( doc, metaTag, "charset", enc);
+            break;
+        default:
+            tidyBufInit(&buf);
+            tidyBufAppend(&buf, "text/html; charset=", 19);
+            tidyBufAppend(&buf, (char*)enc, TY_(tmbstrlen)(enc));
+            tidyBufAppend(&buf, "\0", 1);   /* buf.bp is read as a C string */
+            /* Without http-equiv the content attribute declares nothing. */
+            TY_(AddAttribute)( doc, metaTag, "http-equiv", "Content-Type" );
+            TY_(AddAttribute)( doc, metaTag, "content", (char*)buf.bp);
+            tidyBufFree(&buf);
+        }
+        TY_(InsertNodeAtStart)( head, metaTag );
+    }
+    /* Single exit, so charsetString is released on every path. */
+    tidyBufFree(&charsetString);
+    return yes;
+}
+
/* add meta element for Tidy */
Bool TY_(AddGenerator)( TidyDocImpl* doc )
{
diff --git a/src/lexer.h b/src/lexer.h
index 1d3d9cd..a30e5d8 100644
--- a/src/lexer.h
+++ b/src/lexer.h
@@ -491,6 +491,9 @@ Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
/* Returns containing block element, if any */
Node* TY_(FindContainer)( Node* node );
+/* Adds a meta charset element to HEAD when none exists; an existing one gets
+   its value set to the output encoding and duplicates are discarded.
+   Returns no if the document has no HEAD or the encoding has no name. */
+Bool TY_(TidyMetaCharset)( TidyDocImpl* doc );
+
/* add meta element for Tidy */
Bool TY_(AddGenerator)( TidyDocImpl* doc );
diff --git a/src/tidylib.c b/src/tidylib.c
index fad55e6..811721b 100755
--- a/src/tidylib.c
+++ b/src/tidylib.c
@@ -1992,6 +1992,7 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut );
Bool xmlDecl = cfgBool( doc, TidyXmlDecl );
Bool tidyMark = cfgBool( doc, TidyMark );
+ Bool tidyMetaCharset = cfgBool( doc, TidyMetaCharset);
Bool tidyXmlTags = cfgBool( doc, TidyXmlTags );
Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName );
Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis );
@@ -2095,6 +2096,9 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
if (tidyMark )
TY_(AddGenerator)(doc);
+
+ if (tidyMetaCharset)
+ TY_(TidyMetaCharset)(doc);
}
/* ensure presence of initial <?xml version="1.0"?> */