Issue #456 - More work on this option

This commit is contained in:
Geoff McLane 2017-05-14 19:08:29 +02:00
parent 8843199370
commit 6ebd12be67
4 changed files with 126 additions and 51 deletions

View file

@ -2208,6 +2208,8 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
} }
#endif #endif
/* Issue #456 - This is discarded */
#if 0
void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
{ {
Node *pNode; Node *pNode;
@ -2283,6 +2285,7 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
pLastProp = NULL; pLastProp = NULL;
} }
} }
#endif
void TY_(DropComments)(TidyDocImpl* doc, Node* node) void TY_(DropComments)(TidyDocImpl* doc, Node* node)
{ {

View file

@ -63,8 +63,10 @@ void TY_(BumpObject)( TidyDocImpl* doc, Node *html );
#if 0 #if 0
void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent ); void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent );
#endif #endif
/* Issue #456 - This is discarded */
#if 0
void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent ); void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
#endif
void TY_(DropComments)(TidyDocImpl* doc, Node* node); void TY_(DropComments)(TidyDocImpl* doc, Node* node);
void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode); void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);

View file

@ -1827,19 +1827,21 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
return node; return node;
} }
/* Check meta charset /*\
1. if there is no meta charset, it adds one. * Issue #456 - Check meta charset
2. if there is a meta charset, it moves it to the top of HEAD. * 1. if there is no meta charset, it adds one, according to doctype, no warning.
3. if it doesn't match the output encoding, warn about that. * 2. if there is a meta charset, it moves it to the top of HEAD. Not sure this is required?
4. if there are duplicates, discard them. * 3. if it doesn't match the output encoding, fix it. Maybe no warning?
*/ * 4. if there are duplicates, discard them, with warning.
\*/
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
{ {
AttVal *charsetAttr; AttVal *charsetAttr;
AttVal *contentAttr; AttVal *contentAttr;
AttVal *httpEquivAttr; AttVal *httpEquivAttr;
Bool charsetFound = no; Bool charsetFound = no;
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding)); uint outenc = cfg(doc, TidyOutCharEncoding);
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
Node *currentNode; Node *currentNode;
Node *head = TY_(FindHEAD)( doc ); Node *head = TY_(FindHEAD)( doc );
Node *metaTag; Node *metaTag;
@ -1850,25 +1852,38 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
tmbstr lcontent; tmbstr lcontent;
tmbstr newValue; tmbstr newValue;
/* We can't do anything if we don't have a head or encoding is NULL */ /* We can't do anything if we don't have a head or encoding is NULL */
if( !head || !enc ) if( !head || !enc || !TY_(tmbstrlen)(enc))
return no; return no;
if (outenc == RAW)
return no;
#ifndef NO_NATIVE_ISO2022_SUPPORT
if (outenc == ISO2022)
return no;
#endif
tidyBufInit(&charsetString); tidyBufInit(&charsetString);
/* Set up the content test 'charset=value' */
tidyBufClear(&charsetString);
tidyBufAppend(&charsetString, "charset=", 8);
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
/* process the children of the head */
for (currentNode = head->content; currentNode; currentNode = currentNode->next) for (currentNode = head->content; currentNode; currentNode = currentNode->next)
{ {
if (!nodeIsMETA(currentNode)) if (!nodeIsMETA(currentNode))
continue; continue; /* not a meta node */
charsetAttr = attrGetCHARSET(currentNode); charsetAttr = attrGetCHARSET(currentNode);
httpEquivAttr = attrGetHTTP_EQUIV(currentNode); httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
if(!charsetAttr && !httpEquivAttr) if(!charsetAttr && !httpEquivAttr)
continue; continue; /* has no charset attribute */
/* /*
Meta charset comes in quite a few flavors: Meta charset comes in quite a few flavors:
1. <meta charset=value> - expected for (X)HTML5. 1. <meta charset="value"> - expected for (X)HTML5.
*/ */
if (charsetAttr && !httpEquivAttr) if (charsetAttr && !httpEquivAttr)
{ {
// we already found one, so remove the rest. /* we already found one, so remove the rest. */
if(charsetFound) if(charsetFound || !charsetAttr->value)
{ {
prevNode = currentNode->prev; prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
@ -1877,15 +1892,19 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
continue; continue;
} }
charsetFound = yes; charsetFound = yes;
// Fix mismatched attribute value /* Fix mismatched attribute value */
if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0) if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
{ {
newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) ); newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) + 1 ); /* allocate + 1 for 0 */
TY_(tmbstrcpy)( newValue, enc ); TY_(tmbstrcpy)( newValue, enc );
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED ); /* Note: previously http-equiv had been modified, without warning
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
*/
TidyDocFree(doc, charsetAttr->value); /* free current value */
charsetAttr->value = newValue; charsetAttr->value = newValue;
} }
// Make sure it's the first element. /* Make sure it's the first element. */
if ( currentNode != head->content->next ){ if ( currentNode != head->content->next ){
TY_(RemoveNode)( currentNode ); TY_(RemoveNode)( currentNode );
TY_(InsertNodeAtStart)( head, currentNode ); TY_(InsertNodeAtStart)( head, currentNode );
@ -1893,51 +1912,110 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
continue; continue;
} }
/* /*
2. <meta http-equiv="content-type" content="text/html; charset=UTF-8"> 2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
expected for HTML4. This is normally ok - but can clash. expected for HTML4. This is normally ok - but can clash.
*/ */
if(httpEquivAttr && !charsetAttr) if(httpEquivAttr && !charsetAttr)
{ {
tidyBufClear(&charsetString);
tidyBufAppend(&charsetString, "charset=", 8);
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)( enc ));
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT); contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value); if (!contentAttr)
continue; /* has no 'content' attribute */
if(!contentAttr || TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0) if (!httpEquivAttr->value)
{
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
continue; continue;
lcontent = TY_(tmbstrtolower)(contentAttr->value);
if(TY_(tmbsubstr)(lcontent, (ctmbstr) &charsetString)){
printf("WARN ABOUT CLASH: %s \n", contentAttr->value);
} }
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
if(TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
continue; /* is not 'content-type' */
if (!contentAttr->value)
{
prevNode = currentNode->prev;
/* maybe need better message here */
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
continue;
}
/* check encoding matches
If a mismatch is found here, fix it. Previously this was done silently
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
*/
lcontent = TY_(tmbstrtolower)(contentAttr->value);
if (TY_(tmbsubstr)(lcontent, charsetString.bp))
{
/* we already found one, so remove the rest. */
if (charsetFound)
{
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
continue;
}
charsetFound = yes;
}
else
{
/* fix a mis-match */
if (charsetFound)
{
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
}
else
{
/* correct the content */
newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
TidyDocFree(doc, contentAttr->value);
TY_(tmbstrcpy)(newValue, "text/html; charset=");
TY_(tmbstrcpy)(newValue + 19, enc);
contentAttr->value = newValue;
charsetFound = yes;
}
}
continue;
} }
/* /*
3. <meta charset="utf-8" http-equiv="Content-Type" content="..."> 3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
This is generally bad. This is generally bad. Discard and warn.
*/ */
if(httpEquivAttr && charsetAttr) if(httpEquivAttr && charsetAttr)
{ {
printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); /* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
} }
} }
if(charsetFound){
return yes; /* completed head scan - add appropriate meta - if 'yes' and none exists */
} if (cfgBool(doc, TidyMetaCharset) && !charsetFound)
metaTag = TY_(InferredTag)(doc, TidyTag_META);
switch(TY_(HTMLVersion)(doc))
{ {
/* add appropriate meta charset tag - no warning */
metaTag = TY_(InferredTag)(doc, TidyTag_META);
switch (TY_(HTMLVersion)(doc))
{
case HT50: case HT50:
case XH50: case XH50:
TY_(AddAttribute)( doc, metaTag, "charset", enc); TY_(AddAttribute)(doc, metaTag, "charset", enc);
break; break;
default: default:
tidyBufInit(&buf); tidyBufInit(&buf);
tidyBufAppend(&buf, "text/html; charset=", 19); tidyBufAppend(&buf, "text/html; ", 11);
tidyBufAppend(&buf, (char*)enc, TY_(tmbstrlen)(enc)); tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp));
TY_(AddAttribute)( doc, metaTag, "content", (char*)buf.bp); tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
tidyBufFree(&buf); tidyBufFree(&buf);
}
TY_(InsertNodeAtStart)(head, metaTag);
} }
TY_(InsertNodeAtStart)( head, metaTag );
tidyBufFree(&charsetString); tidyBufFree(&charsetString);
return yes; return yes;
} }

View file

@ -1992,7 +1992,6 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut ); Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut );
Bool xmlDecl = cfgBool( doc, TidyXmlDecl ); Bool xmlDecl = cfgBool( doc, TidyXmlDecl );
Bool tidyMark = cfgBool( doc, TidyMark ); Bool tidyMark = cfgBool( doc, TidyMark );
Bool tidyMetaCharset = cfgBool( doc, TidyMetaCharset);
Bool tidyXmlTags = cfgBool( doc, TidyXmlTags ); Bool tidyXmlTags = cfgBool( doc, TidyXmlTags );
Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName ); Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName );
Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis ); Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis );
@ -2044,12 +2043,7 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
#endif #endif
/* Reconcile http-equiv meta element with output encoding */ /* Reconcile http-equiv meta element with output encoding */
if (cfg( doc, TidyOutCharEncoding) != RAW TY_(TidyMetaCharset)(doc);
#ifndef NO_NATIVE_ISO2022_SUPPORT
&& cfg( doc, TidyOutCharEncoding) != ISO2022
#endif
)
TY_(VerifyHTTPEquiv)( doc, TY_(FindHEAD)( doc ));
if ( !TY_(CheckNodeIntegrity)( &doc->root ) ) if ( !TY_(CheckNodeIntegrity)( &doc->root ) )
TidyPanic( doc->allocator, integrity ); TidyPanic( doc->allocator, integrity );
@ -2097,8 +2091,6 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
if (tidyMark ) if (tidyMark )
TY_(AddGenerator)(doc); TY_(AddGenerator)(doc);
if (tidyMetaCharset)
TY_(TidyMetaCharset)(doc);
} }
/* ensure presence of initial <?xml version="1.0"?> */ /* ensure presence of initial <?xml version="1.0"?> */