Issue #456 - Merge branch 'meta-charset' of tidy-html5-marco.
This pulls the work done by @marcoscaceres WIP #458 into the issue-456 branch, to complete the new add-meta-charset option.
This commit is contained in:
commit
8843199370
|
@ -648,6 +648,7 @@ typedef enum
|
||||||
TidyXmlPIs, /**< If set to yes PIs must end with ?> */
|
TidyXmlPIs, /**< If set to yes PIs must end with ?> */
|
||||||
TidyXmlSpace, /**< If set to yes adds xml:space attr as needed */
|
TidyXmlSpace, /**< If set to yes adds xml:space attr as needed */
|
||||||
TidyXmlTags, /**< Treat input as XML */
|
TidyXmlTags, /**< Treat input as XML */
|
||||||
|
TidyMetaCharset, /**< Adds/checks/fixes meta charset in the head, based on document type */
|
||||||
N_TIDY_OPTIONS /**< Must be last */
|
N_TIDY_OPTIONS /**< Must be last */
|
||||||
} TidyOptionId;
|
} TidyOptionId;
|
||||||
|
|
||||||
|
|
|
@ -184,6 +184,7 @@ Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc);
|
||||||
#define attrIsBOTTOMMARGIN(av) AttrIsId( av, TidyAttr_BOTTOMMARGIN )
|
#define attrIsBOTTOMMARGIN(av) AttrIsId( av, TidyAttr_BOTTOMMARGIN )
|
||||||
#define attrIsCELLPADDING(av) AttrIsId( av, TidyAttr_CELLPADDING )
|
#define attrIsCELLPADDING(av) AttrIsId( av, TidyAttr_CELLPADDING )
|
||||||
#define attrIsCELLSPACING(av) AttrIsId( av, TidyAttr_CELLSPACING )
|
#define attrIsCELLSPACING(av) AttrIsId( av, TidyAttr_CELLSPACING )
|
||||||
|
#define attrIsCHARSET(av) AttrIsId( av, TidyAttr_CHARSET )
|
||||||
#define attrIsCHAR(av) AttrIsId( av, TidyAttr_CHAR )
|
#define attrIsCHAR(av) AttrIsId( av, TidyAttr_CHAR )
|
||||||
#define attrIsCHAROFF(av) AttrIsId( av, TidyAttr_CHAROFF )
|
#define attrIsCHAROFF(av) AttrIsId( av, TidyAttr_CHAROFF )
|
||||||
#define attrIsCHARSET(av) AttrIsId( av, TidyAttr_CHARSET )
|
#define attrIsCHARSET(av) AttrIsId( av, TidyAttr_CHARSET )
|
||||||
|
@ -385,6 +386,7 @@ Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc);
|
||||||
#define attrGetHEIGHT( nod ) TY_(AttrGetById)( nod, TidyAttr_HEIGHT )
|
#define attrGetHEIGHT( nod ) TY_(AttrGetById)( nod, TidyAttr_HEIGHT )
|
||||||
#define attrGetFOR( nod ) TY_(AttrGetById)( nod, TidyAttr_FOR )
|
#define attrGetFOR( nod ) TY_(AttrGetById)( nod, TidyAttr_FOR )
|
||||||
#define attrGetSELECTED( nod ) TY_(AttrGetById)( nod, TidyAttr_SELECTED )
|
#define attrGetSELECTED( nod ) TY_(AttrGetById)( nod, TidyAttr_SELECTED )
|
||||||
|
#define attrGetCHARSET( nod ) TY_(AttrGetById)( nod, TidyAttr_CHARSET )
|
||||||
#define attrGetCHECKED( nod ) TY_(AttrGetById)( nod, TidyAttr_CHECKED )
|
#define attrGetCHECKED( nod ) TY_(AttrGetById)( nod, TidyAttr_CHECKED )
|
||||||
#define attrGetLANG( nod ) TY_(AttrGetById)( nod, TidyAttr_LANG )
|
#define attrGetLANG( nod ) TY_(AttrGetById)( nod, TidyAttr_LANG )
|
||||||
#define attrGetTARGET( nod ) TY_(AttrGetById)( nod, TidyAttr_TARGET )
|
#define attrGetTARGET( nod ) TY_(AttrGetById)( nod, TidyAttr_TARGET )
|
||||||
|
|
|
@ -336,6 +336,7 @@ static const TidyOptionImpl option_defs[] =
|
||||||
{ TidyXmlPIs, MU, "assume-xml-procins", BL, no, ParseBool, boolPicks },
|
{ TidyXmlPIs, MU, "assume-xml-procins", BL, no, ParseBool, boolPicks },
|
||||||
{ TidyXmlSpace, MU, "add-xml-space", BL, no, ParseBool, boolPicks },
|
{ TidyXmlSpace, MU, "add-xml-space", BL, no, ParseBool, boolPicks },
|
||||||
{ TidyXmlTags, MU, "input-xml", BL, no, ParseBool, boolPicks },
|
{ TidyXmlTags, MU, "input-xml", BL, no, ParseBool, boolPicks },
|
||||||
|
{ TidyMetaCharset, MS, "add-meta-charset", BL, no, ParseBool, boolPicks }, /* 20161004 - Issue #456 */
|
||||||
{ N_TIDY_OPTIONS, XX, NULL, XY, 0, NULL, NULL }
|
{ N_TIDY_OPTIONS, XX, NULL, XY, 0, NULL, NULL }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -1508,7 +1508,18 @@ static languageDefinition language_en = { whichPluralForm_en, {
|
||||||
"This option specifies if Tidy should use the XML parser rather than the "
|
"This option specifies if Tidy should use the XML parser rather than the "
|
||||||
"error correcting HTML parser. "
|
"error correcting HTML parser. "
|
||||||
},
|
},
|
||||||
|
{/* Important notes for translators:
|
||||||
|
- Use only <code></code>, <var></var>, <em></em>, <strong></strong>, and
|
||||||
|
<br/>.
|
||||||
|
- Entities, tags, attributes, etc., should be enclosed in <code></code>.
|
||||||
|
- Option values should be enclosed in <var></var>.
|
||||||
|
- It's very important that <br/> be self-closing!
|
||||||
|
- The strings "Tidy" and "HTML Tidy" are the program name and must not
|
||||||
|
be translated. */
|
||||||
|
TidyMetaCharset, 0,
|
||||||
|
"This option adds a meta element and sets the charset attribute to the encoding of the document. "
|
||||||
|
"Set this option to 'yes' if you want this."
|
||||||
|
},
|
||||||
|
|
||||||
/********************************************
|
/********************************************
|
||||||
** TidyConfigCategory enumeration
|
** TidyConfigCategory enumeration
|
||||||
|
@ -1772,7 +1783,6 @@ static languageDefinition language_en = { whichPluralForm_en, {
|
||||||
"https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md"
|
"https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md"
|
||||||
},
|
},
|
||||||
|
|
||||||
|
|
||||||
/********************************************
|
/********************************************
|
||||||
** Report Output
|
** Report Output
|
||||||
** @remark enum source TidyStrings
|
** @remark enum source TidyStrings
|
||||||
|
@ -1886,7 +1896,6 @@ static languageDefinition language_en = { whichPluralForm_en, {
|
||||||
{ ELEMENT_NOT_EMPTY, 0, "%s element not empty or not closed" }, /* ReportError, ReportAttrError */
|
{ ELEMENT_NOT_EMPTY, 0, "%s element not empty or not closed" }, /* ReportError, ReportAttrError */
|
||||||
{ UNEXPECTED_END_OF_FILE, 0, "unexpected end of file %s" }, /* ReportError, ReportAttrError */
|
{ UNEXPECTED_END_OF_FILE, 0, "unexpected end of file %s" }, /* ReportError, ReportAttrError */
|
||||||
{ UNEXPECTED_ENDTAG, 0, "unexpected </%s>" }, /* ReportError, ReportFatal */
|
{ UNEXPECTED_ENDTAG, 0, "unexpected </%s>" }, /* ReportError, ReportFatal */
|
||||||
|
|
||||||
|
|
||||||
#if SUPPORT_ACCESSIBILITY_CHECKS
|
#if SUPPORT_ACCESSIBILITY_CHECKS
|
||||||
|
|
||||||
|
|
115
src/lexer.c
115
src/lexer.c
|
@ -1827,6 +1827,121 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
|
||||||
return node;
|
return node;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Check meta charset
|
||||||
|
1. if there is no meta charset, it adds one.
|
||||||
|
2. if there is a meta charset, it moves it to the top if HEAD.
|
||||||
|
3. if it doesn't match the output encoding, warn about that.
|
||||||
|
4. if there are duplicates, discard them.
|
||||||
|
*/
|
||||||
|
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
|
||||||
|
{
|
||||||
|
AttVal *charsetAttr;
|
||||||
|
AttVal *contentAttr;
|
||||||
|
AttVal *httpEquivAttr;
|
||||||
|
Bool charsetFound = no;
|
||||||
|
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
|
||||||
|
Node *currentNode;
|
||||||
|
Node *head = TY_(FindHEAD)( doc );
|
||||||
|
Node *metaTag;
|
||||||
|
Node *prevNode;
|
||||||
|
TidyBuffer buf;
|
||||||
|
TidyBuffer charsetString;
|
||||||
|
tmbstr httpEquivAttrValue;
|
||||||
|
tmbstr lcontent;
|
||||||
|
tmbstr newValue;
|
||||||
|
/* We can't do anything we don't have a head or encoding is NULL */
|
||||||
|
if( !head || !enc )
|
||||||
|
return no;
|
||||||
|
tidyBufInit(&charsetString);
|
||||||
|
for (currentNode = head->content; currentNode; currentNode = currentNode->next)
|
||||||
|
{
|
||||||
|
if (!nodeIsMETA(currentNode))
|
||||||
|
continue;
|
||||||
|
charsetAttr = attrGetCHARSET(currentNode);
|
||||||
|
httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
|
||||||
|
if(!charsetAttr && !httpEquivAttr)
|
||||||
|
continue;
|
||||||
|
/*
|
||||||
|
Meta charset comes in quite a few flavors:
|
||||||
|
1. <meta charset=value> - expected for (X)HTML5.
|
||||||
|
*/
|
||||||
|
if (charsetAttr && !httpEquivAttr)
|
||||||
|
{
|
||||||
|
// we already found one, so remove the rest.
|
||||||
|
if(charsetFound)
|
||||||
|
{
|
||||||
|
prevNode = currentNode->prev;
|
||||||
|
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||||
|
TY_(DiscardElement)( doc, currentNode );
|
||||||
|
currentNode = prevNode;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
charsetFound = yes;
|
||||||
|
// Fix mismatched attribute value
|
||||||
|
if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
|
||||||
|
{
|
||||||
|
newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) );
|
||||||
|
TY_(tmbstrcpy)( newValue, enc );
|
||||||
|
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
|
||||||
|
charsetAttr->value = newValue;
|
||||||
|
}
|
||||||
|
// Make sure it's the first element.
|
||||||
|
if ( currentNode != head->content->next ){
|
||||||
|
TY_(RemoveNode)( currentNode );
|
||||||
|
TY_(InsertNodeAtStart)( head, currentNode );
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
|
||||||
|
expected for HTML4. This is normally ok - but can clash.
|
||||||
|
*/
|
||||||
|
if(httpEquivAttr && !charsetAttr)
|
||||||
|
{
|
||||||
|
tidyBufClear(&charsetString);
|
||||||
|
tidyBufAppend(&charsetString, "charset=", 8);
|
||||||
|
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)( enc ));
|
||||||
|
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
|
||||||
|
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
|
||||||
|
|
||||||
|
if(!contentAttr || TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
|
||||||
|
continue;
|
||||||
|
lcontent = TY_(tmbstrtolower)(contentAttr->value);
|
||||||
|
if(TY_(tmbsubstr)(lcontent, (ctmbstr) &charsetString)){
|
||||||
|
printf("WARN ABOUT CLASH: %s \n", contentAttr->value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
|
||||||
|
This is generally bad.
|
||||||
|
*/
|
||||||
|
if(httpEquivAttr && charsetAttr)
|
||||||
|
{
|
||||||
|
printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(charsetFound){
|
||||||
|
return yes;
|
||||||
|
}
|
||||||
|
metaTag = TY_(InferredTag)(doc, TidyTag_META);
|
||||||
|
switch(TY_(HTMLVersion)(doc))
|
||||||
|
{
|
||||||
|
case HT50:
|
||||||
|
case XH50:
|
||||||
|
TY_(AddAttribute)( doc, metaTag, "charset", enc);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
tidyBufInit(&buf);
|
||||||
|
tidyBufAppend(&buf, "text/html; charset=", 19);
|
||||||
|
tidyBufAppend(&buf, (char*)enc, TY_(tmbstrlen)(enc));
|
||||||
|
TY_(AddAttribute)( doc, metaTag, "content", (char*)buf.bp);
|
||||||
|
tidyBufFree(&buf);
|
||||||
|
}
|
||||||
|
TY_(InsertNodeAtStart)( head, metaTag );
|
||||||
|
tidyBufFree(&charsetString);
|
||||||
|
return yes;
|
||||||
|
}
|
||||||
|
|
||||||
/* add meta element for Tidy */
|
/* add meta element for Tidy */
|
||||||
Bool TY_(AddGenerator)( TidyDocImpl* doc )
|
Bool TY_(AddGenerator)( TidyDocImpl* doc )
|
||||||
{
|
{
|
||||||
|
|
|
@ -491,6 +491,9 @@ Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
|
||||||
/* Returns containing block element, if any */
|
/* Returns containing block element, if any */
|
||||||
Node* TY_(FindContainer)( Node* node );
|
Node* TY_(FindContainer)( Node* node );
|
||||||
|
|
||||||
|
/* Adds meta element and sets the charset */
|
||||||
|
Bool TY_(TidyMetaCharset)( TidyDocImpl* doc );
|
||||||
|
|
||||||
/* add meta element for Tidy */
|
/* add meta element for Tidy */
|
||||||
Bool TY_(AddGenerator)( TidyDocImpl* doc );
|
Bool TY_(AddGenerator)( TidyDocImpl* doc );
|
||||||
|
|
||||||
|
|
|
@ -1992,6 +1992,7 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
|
||||||
Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut );
|
Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut );
|
||||||
Bool xmlDecl = cfgBool( doc, TidyXmlDecl );
|
Bool xmlDecl = cfgBool( doc, TidyXmlDecl );
|
||||||
Bool tidyMark = cfgBool( doc, TidyMark );
|
Bool tidyMark = cfgBool( doc, TidyMark );
|
||||||
|
Bool tidyMetaCharset = cfgBool( doc, TidyMetaCharset);
|
||||||
Bool tidyXmlTags = cfgBool( doc, TidyXmlTags );
|
Bool tidyXmlTags = cfgBool( doc, TidyXmlTags );
|
||||||
Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName );
|
Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName );
|
||||||
Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis );
|
Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis );
|
||||||
|
@ -2095,6 +2096,9 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
|
||||||
|
|
||||||
if (tidyMark )
|
if (tidyMark )
|
||||||
TY_(AddGenerator)(doc);
|
TY_(AddGenerator)(doc);
|
||||||
|
|
||||||
|
if (tidyMetaCharset)
|
||||||
|
TY_(TidyMetaCharset)(doc);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ensure presence of initial <?xml version="1.0"?> */
|
/* ensure presence of initial <?xml version="1.0"?> */
|
||||||
|
|
Loading…
Reference in a new issue