Issue #456 - Merge branch 'meta-charset' of tidy-html5-marco.

This pulls the work done by @marcoscaceres WIP #458 into the issue-456
branch, to complete the new add-meta-charset option.
This commit is contained in:
Geoff McLane 2017-05-13 15:55:13 +02:00
commit 8843199370
7 changed files with 138 additions and 3 deletions

View file

@ -648,6 +648,7 @@ typedef enum
TidyXmlPIs, /**< If set to yes PIs must end with ?> */
TidyXmlSpace, /**< If set to yes adds xml:space attr as needed */
TidyXmlTags, /**< Treat input as XML */
TidyMetaCharset, /**< Adds/checks/fixes meta charset in the head, based on document type */
N_TIDY_OPTIONS /**< Must be last */
} TidyOptionId;

View file

@ -184,6 +184,7 @@ Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc);
/* Attribute identity tests: each macro expands to AttrIsId() with the
   matching TidyAttr_* id. Kept in alphabetical order, one definition each. */
#define attrIsBOTTOMMARGIN(av) AttrIsId( av, TidyAttr_BOTTOMMARGIN )
#define attrIsCELLPADDING(av) AttrIsId( av, TidyAttr_CELLPADDING )
#define attrIsCELLSPACING(av) AttrIsId( av, TidyAttr_CELLSPACING )
#define attrIsCHAR(av) AttrIsId( av, TidyAttr_CHAR )
#define attrIsCHAROFF(av) AttrIsId( av, TidyAttr_CHAROFF )
#define attrIsCHARSET(av) AttrIsId( av, TidyAttr_CHARSET )
@ -385,6 +386,7 @@ Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc);
/* Attribute lookup shortcuts: each macro expands to TY_(AttrGetById)() on
   the given node with the matching TidyAttr_* id. */
#define attrGetHEIGHT( nod ) TY_(AttrGetById)( nod, TidyAttr_HEIGHT )
#define attrGetFOR( nod ) TY_(AttrGetById)( nod, TidyAttr_FOR )
#define attrGetSELECTED( nod ) TY_(AttrGetById)( nod, TidyAttr_SELECTED )
#define attrGetCHARSET( nod ) TY_(AttrGetById)( nod, TidyAttr_CHARSET )
#define attrGetCHECKED( nod ) TY_(AttrGetById)( nod, TidyAttr_CHECKED )
#define attrGetLANG( nod ) TY_(AttrGetById)( nod, TidyAttr_LANG )
#define attrGetTARGET( nod ) TY_(AttrGetById)( nod, TidyAttr_TARGET )

View file

@ -336,6 +336,7 @@ static const TidyOptionImpl option_defs[] =
{ TidyXmlPIs, MU, "assume-xml-procins", BL, no, ParseBool, boolPicks },
{ TidyXmlSpace, MU, "add-xml-space", BL, no, ParseBool, boolPicks },
{ TidyXmlTags, MU, "input-xml", BL, no, ParseBool, boolPicks },
{ TidyMetaCharset, MS, "add-meta-charset", BL, no, ParseBool, boolPicks }, /* 20161004 - Issue #456 */
{ N_TIDY_OPTIONS, XX, NULL, XY, 0, NULL, NULL }
};

View file

@ -1508,7 +1508,18 @@ static languageDefinition language_en = { whichPluralForm_en, {
"This option specifies if Tidy should use the XML parser rather than the "
"error correcting HTML parser. "
},
{/* Important notes for translators:
- Use only <code></code>, <var></var>, <em></em>, <strong></strong>, and
<br/>.
- Entities, tags, attributes, etc., should be enclosed in <code></code>.
- Option values should be enclosed in <var></var>.
- It's very important that <br/> be self-closing!
- The strings "Tidy" and "HTML Tidy" are the program name and must not
be translated. */
TidyMetaCharset, 0,
/* Adjacent literals are concatenated, so every fragment except the last
   must end with a space. */
"This option adds a <code>meta</code> element and sets the "
"<code>charset</code> attribute to the encoding of the document. "
"Set this option to <var>yes</var> if you want this."
},
/********************************************
** TidyConfigCategory enumeration
@ -1772,7 +1783,6 @@ static languageDefinition language_en = { whichPluralForm_en, {
"https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md"
},
/********************************************
** Report Output
** @remark enum source TidyStrings
@ -1886,7 +1896,6 @@ static languageDefinition language_en = { whichPluralForm_en, {
{ ELEMENT_NOT_EMPTY, 0, "%s element not empty or not closed" }, /* ReportError, ReportAttrError */
{ UNEXPECTED_END_OF_FILE, 0, "unexpected end of file %s" }, /* ReportError, ReportAttrError */
{ UNEXPECTED_ENDTAG, 0, "unexpected </%s>" }, /* ReportError, ReportFatal */
#if SUPPORT_ACCESSIBILITY_CHECKS

View file

@ -1827,6 +1827,121 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
return node;
}
/* Check meta charset
   1. if there is no meta charset, it adds one.
   2. if there is a meta charset, it moves it to the top of HEAD.
   3. if it doesn't match the output encoding, the value is replaced and
      BAD_ATTRIBUTE_VALUE_REPLACED is reported.
   4. if there are duplicates (or a charset attribute without a value),
      they are discarded with a DISCARDING_UNEXPECTED report.

   Returns no when the document has no HEAD or the output encoding has no
   name (nothing is changed in that case); returns yes otherwise.
*/
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
{
    AttVal *charsetAttr;
    AttVal *contentAttr;
    AttVal *httpEquivAttr;
    Bool charsetFound = no;
    ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
    Node *currentNode;
    Node *nextNode;
    Node *head = TY_(FindHEAD)( doc );
    Node *metaTag;
    TidyBuffer buf;
    TidyBuffer charsetString;
    tmbstr httpEquivAttrValue;
    tmbstr lcontent;
    tmbstr newValue;

    /* We can't do anything if we don't have a head or encoding is NULL */
    if ( !head || !enc )
        return no;

    /* Build the "charset=<enc>" needle once; it does not change per node.
       The explicit NUL makes the buffer safe to use as a C string. */
    tidyBufInit(&charsetString);
    tidyBufAppend(&charsetString, "charset=", 8);
    tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)( enc ));
    tidyBufAppend(&charsetString, "\0", 1);

    /* The successor is captured before the body runs because the current
       node may be discarded or moved to the start of HEAD below. */
    for (currentNode = head->content; currentNode; currentNode = nextNode)
    {
        nextNode = currentNode->next;

        if (!nodeIsMETA(currentNode))
            continue;

        charsetAttr = attrGetCHARSET(currentNode);
        httpEquivAttr = attrGetHTTP_EQUIV(currentNode);

        if (!charsetAttr && !httpEquivAttr)
            continue;

        /*
           Meta charset comes in quite a few flavors:
           1. <meta charset=value> - expected for (X)HTML5.
        */
        if (charsetAttr && !httpEquivAttr)
        {
            /* We already found one, or this one has no value to check:
               remove it. */
            if (charsetFound || !charsetAttr->value)
            {
                TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
                TY_(DiscardElement)( doc, currentNode );
                continue;
            }
            charsetFound = yes;

            /* Fix mismatched attribute value */
            if (TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
            {
                /* +1 for the terminating NUL copied by tmbstrcpy */
                newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) + 1 );
                TY_(tmbstrcpy)( newValue, enc );
                TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
                /* NOTE(review): the previous value is not released here;
                   if the attribute owns it, free it with the document
                   allocator before overwriting - TODO confirm ownership. */
                charsetAttr->value = newValue;
            }

            /* Make sure it's the first element. */
            if ( currentNode != head->content )
            {
                TY_(RemoveNode)( currentNode );
                TY_(InsertNodeAtStart)( head, currentNode );
            }
            continue;
        }

        /*
           2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
              expected for HTML4. This is normally ok - but can clash.
        */
        if (httpEquivAttr && !charsetAttr)
        {
            if (!httpEquivAttr->value)
                continue;

            /* Presumably lowercases the stored value in place (the compare
               below used to read httpEquivAttr->value directly) - verify. */
            httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);

            contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
            if (!contentAttr || !contentAttr->value ||
                TY_(tmbstrcmp)(httpEquivAttrValue, (tmbstr) "content-type") != 0)
                continue;

            lcontent = TY_(tmbstrtolower)(contentAttr->value);

            /* Search for the needle text itself (the buffer's bytes), not
               the address of the TidyBuffer structure. */
            if (TY_(tmbsubstr)(lcontent, (ctmbstr) charsetString.bp))
            {
                /* NOTE(review): debug output on stdout; should go through
                   the document's report mechanism once a message exists. */
                printf("WARN ABOUT CLASH: %s \n", contentAttr->value);
            }
        }

        /*
           3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
              This is generally bad.
        */
        if (httpEquivAttr && charsetAttr)
        {
            /* NOTE(review): debug output on stdout, see above. */
            printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n");
        }
    }

    /* NOTE(review): only a <meta charset> counts as "found"; an existing
       http-equiv content-type meta does not stop a new element from being
       added - confirm whether that is intended. */
    if (!charsetFound)
    {
        metaTag = TY_(InferredTag)(doc, TidyTag_META);
        switch (TY_(HTMLVersion)(doc))
        {
        case HT50:
        case XH50:
            TY_(AddAttribute)( doc, metaTag, "charset", enc);
            break;
        default:
            /* Pre-HTML5 form: content is only meaningful together with
               http-equiv="Content-Type". */
            tidyBufInit(&buf);
            tidyBufAppend(&buf, "text/html; charset=", 19);
            tidyBufAppend(&buf, (char*)enc, TY_(tmbstrlen)(enc));
            tidyBufAppend(&buf, "\0", 1);
            TY_(AddAttribute)( doc, metaTag, "http-equiv", "Content-Type");
            TY_(AddAttribute)( doc, metaTag, "content", (char*)buf.bp);
            tidyBufFree(&buf);
            break;
        }
        TY_(InsertNodeAtStart)( head, metaTag );
    }

    /* Released on every path that initialised it. */
    tidyBufFree(&charsetString);
    return yes;
}
/* add meta element for Tidy */
Bool TY_(AddGenerator)( TidyDocImpl* doc )
{

View file

@ -491,6 +491,9 @@ Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
/* Returns containing block element, if any */
Node* TY_(FindContainer)( Node* node );
/* Adds meta element and sets the charset: ensures the head carries a meta
   charset matching the output encoding, adding one if none is present.
   Returns no if the document has no head or the output encoding has no
   name, yes otherwise. */
Bool TY_(TidyMetaCharset)( TidyDocImpl* doc );
/* add meta element for Tidy */
Bool TY_(AddGenerator)( TidyDocImpl* doc );

View file

@ -1992,6 +1992,7 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut );
Bool xmlDecl = cfgBool( doc, TidyXmlDecl );
Bool tidyMark = cfgBool( doc, TidyMark );
Bool tidyMetaCharset = cfgBool( doc, TidyMetaCharset);
Bool tidyXmlTags = cfgBool( doc, TidyXmlTags );
Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName );
Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis );
@ -2095,6 +2096,9 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
if (tidyMark )
TY_(AddGenerator)(doc);
if (tidyMetaCharset)
TY_(TidyMetaCharset)(doc);
}
/* ensure presence of initial <?xml version="1.0"?> */