Issue #456 - Merge branch 'meta-charset' of tidy-html5-marco.
This pulls the work done by @marcoscaceres in WIP #458 into the issue-456 branch to complete the new add-meta-charset option.
This commit is contained in:
commit
8843199370
|
@ -648,6 +648,7 @@ typedef enum
|
|||
TidyXmlPIs, /**< If set to yes PIs must end with ?> */
|
||||
TidyXmlSpace, /**< If set to yes adds xml:space attr as needed */
|
||||
TidyXmlTags, /**< Treat input as XML */
|
||||
TidyMetaCharset, /**< Adds/checks/fixes meta charset in the head, based on document type */
|
||||
N_TIDY_OPTIONS /**< Must be last */
|
||||
} TidyOptionId;
|
||||
|
||||
|
|
|
@ -184,6 +184,7 @@ Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc);
|
|||
#define attrIsBOTTOMMARGIN(av) AttrIsId( av, TidyAttr_BOTTOMMARGIN )
|
||||
#define attrIsCELLPADDING(av) AttrIsId( av, TidyAttr_CELLPADDING )
|
||||
#define attrIsCELLSPACING(av) AttrIsId( av, TidyAttr_CELLSPACING )
|
||||
#define attrIsCHARSET(av) AttrIsId( av, TidyAttr_CHARSET )
|
||||
#define attrIsCHAR(av) AttrIsId( av, TidyAttr_CHAR )
|
||||
#define attrIsCHAROFF(av) AttrIsId( av, TidyAttr_CHAROFF )
|
||||
#define attrIsCHARSET(av) AttrIsId( av, TidyAttr_CHARSET )
|
||||
|
@ -385,6 +386,7 @@ Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc);
|
|||
#define attrGetHEIGHT( nod ) TY_(AttrGetById)( nod, TidyAttr_HEIGHT )
|
||||
#define attrGetFOR( nod ) TY_(AttrGetById)( nod, TidyAttr_FOR )
|
||||
#define attrGetSELECTED( nod ) TY_(AttrGetById)( nod, TidyAttr_SELECTED )
|
||||
#define attrGetCHARSET( nod ) TY_(AttrGetById)( nod, TidyAttr_CHARSET )
|
||||
#define attrGetCHECKED( nod ) TY_(AttrGetById)( nod, TidyAttr_CHECKED )
|
||||
#define attrGetLANG( nod ) TY_(AttrGetById)( nod, TidyAttr_LANG )
|
||||
#define attrGetTARGET( nod ) TY_(AttrGetById)( nod, TidyAttr_TARGET )
|
||||
|
|
|
@ -336,6 +336,7 @@ static const TidyOptionImpl option_defs[] =
|
|||
{ TidyXmlPIs, MU, "assume-xml-procins", BL, no, ParseBool, boolPicks },
|
||||
{ TidyXmlSpace, MU, "add-xml-space", BL, no, ParseBool, boolPicks },
|
||||
{ TidyXmlTags, MU, "input-xml", BL, no, ParseBool, boolPicks },
|
||||
{ TidyMetaCharset, MS, "add-meta-charset", BL, no, ParseBool, boolPicks }, /* 20161004 - Issue #456 */
|
||||
{ N_TIDY_OPTIONS, XX, NULL, XY, 0, NULL, NULL }
|
||||
};
|
||||
|
||||
|
|
|
@ -1508,7 +1508,18 @@ static languageDefinition language_en = { whichPluralForm_en, {
|
|||
"This option specifies if Tidy should use the XML parser rather than the "
|
||||
"error correcting HTML parser. "
|
||||
},
|
||||
|
||||
{/* Important notes for translators:
    - Use only <code></code>, <var></var>, <em></em>, <strong></strong>, and
      <br/>.
    - Entities, tags, attributes, etc., should be enclosed in <code></code>.
    - Option values should be enclosed in <var></var>.
    - It's very important that <br/> be self-closing!
    - The strings "Tidy" and "HTML Tidy" are the program name and must not
      be translated. */
  /* Adjacent string literals are concatenated by the compiler, so every
     sentence must end with a space or the sentences run together. */
  TidyMetaCharset,                            0,
    "This option adds a <code>meta</code> element and sets the "
    "<code>charset</code> attribute to the encoding of the document. "
    "Set this option to <var>yes</var> if you want this. "
},
|
||||
|
||||
/********************************************
|
||||
** TidyConfigCategory enumeration
|
||||
|
@ -1772,7 +1783,6 @@ static languageDefinition language_en = { whichPluralForm_en, {
|
|||
"https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md"
|
||||
},
|
||||
|
||||
|
||||
/********************************************
|
||||
** Report Output
|
||||
** @remark enum source TidyStrings
|
||||
|
@ -1887,7 +1897,6 @@ static languageDefinition language_en = { whichPluralForm_en, {
|
|||
{ UNEXPECTED_END_OF_FILE, 0, "unexpected end of file %s" }, /* ReportError, ReportAttrError */
|
||||
{ UNEXPECTED_ENDTAG, 0, "unexpected </%s>" }, /* ReportError, ReportFatal */
|
||||
|
||||
|
||||
#if SUPPORT_ACCESSIBILITY_CHECKS
|
||||
|
||||
/***************************************
|
||||
|
|
115
src/lexer.c
115
src/lexer.c
|
@ -1827,6 +1827,121 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
|
|||
return node;
|
||||
}
|
||||
|
||||
/* Check meta charset
|
||||
1. if there is no meta charset, it adds one.
|
||||
2. if there is a meta charset, it moves it to the top if HEAD.
|
||||
3. if it doesn't match the output encoding, warn about that.
|
||||
4. if there are duplicates, discard them.
|
||||
*/
|
||||
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
|
||||
{
|
||||
AttVal *charsetAttr;
|
||||
AttVal *contentAttr;
|
||||
AttVal *httpEquivAttr;
|
||||
Bool charsetFound = no;
|
||||
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
|
||||
Node *currentNode;
|
||||
Node *head = TY_(FindHEAD)( doc );
|
||||
Node *metaTag;
|
||||
Node *prevNode;
|
||||
TidyBuffer buf;
|
||||
TidyBuffer charsetString;
|
||||
tmbstr httpEquivAttrValue;
|
||||
tmbstr lcontent;
|
||||
tmbstr newValue;
|
||||
/* We can't do anything we don't have a head or encoding is NULL */
|
||||
if( !head || !enc )
|
||||
return no;
|
||||
tidyBufInit(&charsetString);
|
||||
for (currentNode = head->content; currentNode; currentNode = currentNode->next)
|
||||
{
|
||||
if (!nodeIsMETA(currentNode))
|
||||
continue;
|
||||
charsetAttr = attrGetCHARSET(currentNode);
|
||||
httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
|
||||
if(!charsetAttr && !httpEquivAttr)
|
||||
continue;
|
||||
/*
|
||||
Meta charset comes in quite a few flavors:
|
||||
1. <meta charset=value> - expected for (X)HTML5.
|
||||
*/
|
||||
if (charsetAttr && !httpEquivAttr)
|
||||
{
|
||||
// we already found one, so remove the rest.
|
||||
if(charsetFound)
|
||||
{
|
||||
prevNode = currentNode->prev;
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)( doc, currentNode );
|
||||
currentNode = prevNode;
|
||||
continue;
|
||||
}
|
||||
charsetFound = yes;
|
||||
// Fix mismatched attribute value
|
||||
if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
|
||||
{
|
||||
newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) );
|
||||
TY_(tmbstrcpy)( newValue, enc );
|
||||
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
|
||||
charsetAttr->value = newValue;
|
||||
}
|
||||
// Make sure it's the first element.
|
||||
if ( currentNode != head->content->next ){
|
||||
TY_(RemoveNode)( currentNode );
|
||||
TY_(InsertNodeAtStart)( head, currentNode );
|
||||
}
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
|
||||
expected for HTML4. This is normally ok - but can clash.
|
||||
*/
|
||||
if(httpEquivAttr && !charsetAttr)
|
||||
{
|
||||
tidyBufClear(&charsetString);
|
||||
tidyBufAppend(&charsetString, "charset=", 8);
|
||||
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)( enc ));
|
||||
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
|
||||
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
|
||||
|
||||
if(!contentAttr || TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
|
||||
continue;
|
||||
lcontent = TY_(tmbstrtolower)(contentAttr->value);
|
||||
if(TY_(tmbsubstr)(lcontent, (ctmbstr) &charsetString)){
|
||||
printf("WARN ABOUT CLASH: %s \n", contentAttr->value);
|
||||
}
|
||||
}
|
||||
/*
|
||||
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
|
||||
This is generally bad.
|
||||
*/
|
||||
if(httpEquivAttr && charsetAttr)
|
||||
{
|
||||
printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n");
|
||||
}
|
||||
}
|
||||
if(charsetFound){
|
||||
return yes;
|
||||
}
|
||||
metaTag = TY_(InferredTag)(doc, TidyTag_META);
|
||||
switch(TY_(HTMLVersion)(doc))
|
||||
{
|
||||
case HT50:
|
||||
case XH50:
|
||||
TY_(AddAttribute)( doc, metaTag, "charset", enc);
|
||||
break;
|
||||
default:
|
||||
tidyBufInit(&buf);
|
||||
tidyBufAppend(&buf, "text/html; charset=", 19);
|
||||
tidyBufAppend(&buf, (char*)enc, TY_(tmbstrlen)(enc));
|
||||
TY_(AddAttribute)( doc, metaTag, "content", (char*)buf.bp);
|
||||
tidyBufFree(&buf);
|
||||
}
|
||||
TY_(InsertNodeAtStart)( head, metaTag );
|
||||
tidyBufFree(&charsetString);
|
||||
return yes;
|
||||
}
|
||||
|
||||
/* add meta element for Tidy */
|
||||
Bool TY_(AddGenerator)( TidyDocImpl* doc )
|
||||
{
|
||||
|
|
|
@ -491,6 +491,9 @@ Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
|
|||
/* Returns containing block element, if any */
|
||||
Node* TY_(FindContainer)( Node* node );
|
||||
|
||||
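/* Ensures HEAD declares the output encoding: adds a charset meta element if none exists, replaces a mismatched charset value, moves the element to the top of HEAD and discards duplicates. Returns no if there is no HEAD or no encoding name. */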
|
||||
Bool TY_(TidyMetaCharset)( TidyDocImpl* doc );
|
||||
|
||||
/* add meta element for Tidy */
|
||||
Bool TY_(AddGenerator)( TidyDocImpl* doc );
|
||||
|
||||
|
|
|
@ -1992,6 +1992,7 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
|
|||
Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut );
|
||||
Bool xmlDecl = cfgBool( doc, TidyXmlDecl );
|
||||
Bool tidyMark = cfgBool( doc, TidyMark );
|
||||
Bool tidyMetaCharset = cfgBool( doc, TidyMetaCharset);
|
||||
Bool tidyXmlTags = cfgBool( doc, TidyXmlTags );
|
||||
Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName );
|
||||
Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis );
|
||||
|
@ -2095,6 +2096,9 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
|
|||
|
||||
if (tidyMark )
|
||||
TY_(AddGenerator)(doc);
|
||||
|
||||
if (tidyMetaCharset)
|
||||
TY_(TidyMetaCharset)(doc);
|
||||
}
|
||||
|
||||
/* ensure presence of initial <?xml version="1.0"?> */
|
||||
|
|
Loading…
Reference in a new issue