Issue #456 - More work on this option

This commit is contained in:
Geoff McLane 2017-05-14 19:08:29 +02:00
parent 8843199370
commit 6ebd12be67
4 changed files with 126 additions and 51 deletions

View File

@ -2208,6 +2208,8 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
}
#endif
/* Issue #456 - This is discarded */
#if 0
void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
{
Node *pNode;
@ -2283,6 +2285,7 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
pLastProp = NULL;
}
}
#endif
void TY_(DropComments)(TidyDocImpl* doc, Node* node)
{

View File

@ -63,8 +63,10 @@ void TY_(BumpObject)( TidyDocImpl* doc, Node *html );
#if 0
void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent );
#endif
/* Issue #456 - This is discarded */
#if 0
void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
#endif
void TY_(DropComments)(TidyDocImpl* doc, Node* node);
void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);

View File

@ -1827,19 +1827,21 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
return node;
}
/* Check meta charset
1. if there is no meta charset, it adds one.
2. if there is a meta charset, it moves it to the top of HEAD.
3. if it doesn't match the output encoding, warn about that.
4. if there are duplicates, discard them.
*/
/*\
* Issue #456 - Check meta charset
* 1. if there is no meta charset, it adds one, according to doctype, no warning.
* 2. if there is a meta charset, it moves it to the top of HEAD. Not sure this is required?
* 3. if it doesn't match the output encoding, fix it. Maybe no warning?
* 4. if there are duplicates, discard them, with warning.
\*/
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
{
AttVal *charsetAttr;
AttVal *contentAttr;
AttVal *httpEquivAttr;
Bool charsetFound = no;
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
uint outenc = cfg(doc, TidyOutCharEncoding);
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
Node *currentNode;
Node *head = TY_(FindHEAD)( doc );
Node *metaTag;
@ -1850,25 +1852,38 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
tmbstr lcontent;
tmbstr newValue;
/* We can't do anything if we don't have a head or encoding is NULL */
if( !head || !enc )
if( !head || !enc || !TY_(tmbstrlen)(enc))
return no;
if (outenc == RAW)
return no;
#ifndef NO_NATIVE_ISO2022_SUPPORT
if (outenc == ISO2022)
return no;
#endif
tidyBufInit(&charsetString);
/* Set up the content test 'charset=value' */
tidyBufClear(&charsetString);
tidyBufAppend(&charsetString, "charset=", 8);
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
/* process the children of the head */
for (currentNode = head->content; currentNode; currentNode = currentNode->next)
{
if (!nodeIsMETA(currentNode))
continue;
continue; /* not a meta node */
charsetAttr = attrGetCHARSET(currentNode);
httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
if(!charsetAttr && !httpEquivAttr)
continue;
continue; /* has no charset attribute */
/*
Meta charset comes in quite a few flavors:
1. <meta charset=value> - expected for (X)HTML5.
1. <meta charset="value"> - expected for (X)HTML5.
*/
if (charsetAttr && !httpEquivAttr)
{
// we already found one, so remove the rest.
if(charsetFound)
/* we already found one, so remove the rest. */
if(charsetFound || !charsetAttr->value)
{
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
@ -1877,15 +1892,19 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
continue;
}
charsetFound = yes;
// Fix mismatched attribute value
/* Fix mismatched attribute value */
if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
{
newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) );
newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) + 1 ); /* allocate + 1 for 0 */
TY_(tmbstrcpy)( newValue, enc );
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
/* Note: previously http-equiv had been modified, without warning
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
*/
TidyDocFree(doc, charsetAttr->value); /* free current value */
charsetAttr->value = newValue;
}
// Make sure it's the first element.
/* Make sure it's the first element. */
if ( currentNode != head->content->next ){
TY_(RemoveNode)( currentNode );
TY_(InsertNodeAtStart)( head, currentNode );
@ -1893,51 +1912,110 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
continue;
}
/*
2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
expected for HTML4. This is normally ok - but can clash.
2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
expected for HTML4. This is normally ok - but can clash.
*/
if(httpEquivAttr && !charsetAttr)
{
tidyBufClear(&charsetString);
tidyBufAppend(&charsetString, "charset=", 8);
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)( enc ));
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
if(!contentAttr || TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
if (!contentAttr)
continue; /* has no 'content' attribute */
if (!httpEquivAttr->value)
{
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
continue;
lcontent = TY_(tmbstrtolower)(contentAttr->value);
if(TY_(tmbsubstr)(lcontent, (ctmbstr) &charsetString)){
printf("WARN ABOUT CLASH: %s \n", contentAttr->value);
}
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
if(TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
continue; /* is not 'content-type' */
if (!contentAttr->value)
{
prevNode = currentNode->prev;
/* maybe need better message here */
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
continue;
}
/* check encoding matches
If a mismatch is found here, fix it. Previously this was done silently
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
*/
lcontent = TY_(tmbstrtolower)(contentAttr->value);
if (TY_(tmbsubstr)(lcontent, charsetString.bp))
{
/* we already found one, so remove the rest. */
if (charsetFound)
{
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
continue;
}
charsetFound = yes;
}
else
{
/* fix a mismatch */
if (charsetFound)
{
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
}
else
{
/* correct the content */
newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
TidyDocFree(doc, contentAttr->value);
TY_(tmbstrcpy)(newValue, "text/html; charset=");
TY_(tmbstrcpy)(newValue + 19, enc);
contentAttr->value = newValue;
charsetFound = yes;
}
}
continue;
}
/*
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
This is generally bad.
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
This is generally bad. Discard and warn.
*/
if(httpEquivAttr && charsetAttr)
{
printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n");
/* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
}
}
if(charsetFound){
return yes;
}
metaTag = TY_(InferredTag)(doc, TidyTag_META);
switch(TY_(HTMLVersion)(doc))
/* completed head scan - add appropriate meta - if 'yes' and none exists */
if (cfgBool(doc, TidyMetaCharset) && !charsetFound)
{
/* add appropriate meta charset tag - no warning */
metaTag = TY_(InferredTag)(doc, TidyTag_META);
switch (TY_(HTMLVersion)(doc))
{
case HT50:
case XH50:
TY_(AddAttribute)( doc, metaTag, "charset", enc);
TY_(AddAttribute)(doc, metaTag, "charset", enc);
break;
default:
tidyBufInit(&buf);
tidyBufAppend(&buf, "text/html; charset=", 19);
tidyBufAppend(&buf, (char*)enc, TY_(tmbstrlen)(enc));
TY_(AddAttribute)( doc, metaTag, "content", (char*)buf.bp);
tidyBufAppend(&buf, "text/html; ", 11);
tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp));
tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
tidyBufFree(&buf);
}
TY_(InsertNodeAtStart)(head, metaTag);
}
TY_(InsertNodeAtStart)( head, metaTag );
tidyBufFree(&charsetString);
return yes;
}

View File

@ -1992,7 +1992,6 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut );
Bool xmlDecl = cfgBool( doc, TidyXmlDecl );
Bool tidyMark = cfgBool( doc, TidyMark );
Bool tidyMetaCharset = cfgBool( doc, TidyMetaCharset);
Bool tidyXmlTags = cfgBool( doc, TidyXmlTags );
Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName );
Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis );
@ -2044,12 +2043,7 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
#endif
/* Reconcile http-equiv meta element with output encoding */
if (cfg( doc, TidyOutCharEncoding) != RAW
#ifndef NO_NATIVE_ISO2022_SUPPORT
&& cfg( doc, TidyOutCharEncoding) != ISO2022
#endif
)
TY_(VerifyHTTPEquiv)( doc, TY_(FindHEAD)( doc ));
TY_(TidyMetaCharset)(doc);
if ( !TY_(CheckNodeIntegrity)( &doc->root ) )
TidyPanic( doc->allocator, integrity );
@ -2097,8 +2091,6 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
if (tidyMark )
TY_(AddGenerator)(doc);
if (tidyMetaCharset)
TY_(TidyMetaCharset)(doc);
}
/* ensure presence of initial <?xml version="1.0"?> */