Issue #456 - Move new TidyMetaCharset to clean

This commit is contained in:
Geoff McLane 2017-05-15 16:39:53 +02:00
parent 6ebd12be67
commit f310f1d5de
3 changed files with 197 additions and 194 deletions

View file

@ -2208,7 +2208,8 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
} }
#endif #endif
/* Issue #456 - This is discarded */ /* Issue #456 - This is discarded
See replacement TidyMetaCharset */
#if 0 #if 0
void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
{ {
@ -2287,6 +2288,200 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
} }
#endif #endif
/*\
* Issue #456 - Check meta charset
* 1. if there is no meta charset, it adds one, according to doctype, no warning.
* 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required?
* 3. if it doesn't match the output encoding, and fix. Naybe no warning?
* 4. if there are duplicates, discard them, with warning.
\*/
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
{
AttVal *charsetAttr;
AttVal *contentAttr;
AttVal *httpEquivAttr;
Bool charsetFound = no;
uint outenc = cfg(doc, TidyOutCharEncoding);
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
Node *currentNode;
Node *head = TY_(FindHEAD)(doc);
Node *metaTag;
Node *prevNode;
TidyBuffer buf;
TidyBuffer charsetString;
tmbstr httpEquivAttrValue;
tmbstr lcontent;
tmbstr newValue;
/* We can't do anything we don't have a head or encoding is NULL */
if (!head || !enc || !TY_(tmbstrlen)(enc))
return no;
if (outenc == RAW)
return no;
#ifndef NO_NATIVE_ISO2022_SUPPORT
if (outenc == ISO2022)
return no;
#endif
tidyBufInit(&charsetString);
/* Set up the content test 'charset=value' */
tidyBufClear(&charsetString);
tidyBufAppend(&charsetString, "charset=", 8);
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
/* process the children of the head */
for (currentNode = head->content; currentNode; currentNode = currentNode->next)
{
if (!nodeIsMETA(currentNode))
continue; /* not a meta node */
charsetAttr = attrGetCHARSET(currentNode);
httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
if (!charsetAttr && !httpEquivAttr)
continue; /* has no charset attribute */
/*
Meta charset comes in quite a few flavors:
1. <meta charset="value"> - expected for (X)HTML5.
*/
if (charsetAttr && !httpEquivAttr)
{
/* we already found one, so remove the rest. */
if (charsetFound || !charsetAttr->value)
{
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
continue;
}
charsetFound = yes;
/* Fix mismatched attribute value */
if (TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
{
newValue = (tmbstr)TidyDocAlloc(doc, TY_(tmbstrlen)(enc) + 1); /* allocate + 1 for 0 */
TY_(tmbstrcpy)(newValue, enc);
/* Note: previously http-equiv had been modified, without warning
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
*/
TidyDocFree(doc, charsetAttr->value); /* free current value */
charsetAttr->value = newValue;
}
/* Make sure it's the first element. */
if (currentNode != head->content->next) {
TY_(RemoveNode)(currentNode);
TY_(InsertNodeAtStart)(head, currentNode);
}
continue;
}
/*
2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
expected for HTML4. This is normally ok - but can clash.
*/
if (httpEquivAttr && !charsetAttr)
{
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
if (!contentAttr)
continue; /* has no 'content' attribute */
if (!httpEquivAttr->value)
{
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
continue;
}
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
if (TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
continue; /* is not 'content-type' */
if (!contentAttr->value)
{
prevNode = currentNode->prev;
/* maybe need better message here */
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
continue;
}
/* check encoding matches
If a miss-match found here, fix it. previous silently done
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
*/
lcontent = TY_(tmbstrtolower)(contentAttr->value);
if (TY_(tmbsubstr)(lcontent, charsetString.bp))
{
/* we already found one, so remove the rest. */
if (charsetFound)
{
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
continue;
}
charsetFound = yes;
}
else
{
/* fix a mis-match */
if (charsetFound)
{
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
}
else
{
/* correct the content */
newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
TidyDocFree(doc, contentAttr->value);
TY_(tmbstrcpy)(newValue, "text/html; charset=");
TY_(tmbstrcpy)(newValue + 19, enc);
contentAttr->value = newValue;
charsetFound = yes;
}
}
continue;
}
/*
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
This is generally bad. Discard and warn.
*/
if (httpEquivAttr && charsetAttr)
{
/* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
}
}
/* completed head scan - add appropriate meta - if 'yes' and none exists */
if (cfgBool(doc, TidyMetaCharset) && !charsetFound)
{
/* add appropriate meta charset tag - no warning */
metaTag = TY_(InferredTag)(doc, TidyTag_META);
switch (TY_(HTMLVersion)(doc))
{
case HT50:
case XH50:
TY_(AddAttribute)(doc, metaTag, "charset", enc);
break;
default:
tidyBufInit(&buf);
tidyBufAppend(&buf, "text/html; ", 11);
tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp));
tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
tidyBufFree(&buf);
}
TY_(InsertNodeAtStart)(head, metaTag);
}
tidyBufFree(&charsetString);
return yes;
}
void TY_(DropComments)(TidyDocImpl* doc, Node* node) void TY_(DropComments)(TidyDocImpl* doc, Node* node)
{ {
Node* next; Node* next;

View file

@ -67,6 +67,7 @@ void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent );
#if 0 #if 0
void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent ); void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
#endif #endif
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc);
void TY_(DropComments)(TidyDocImpl* doc, Node* node); void TY_(DropComments)(TidyDocImpl* doc, Node* node);
void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode); void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);

View file

@ -1827,199 +1827,6 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
return node; return node;
} }
/*\
* Issue #456 - Check meta charset
* 1. if there is no meta charset, it adds one, according to doctype, no warning.
* 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required?
* 3. if it doesn't match the output encoding, and fix. Naybe no warning?
* 4. if there are duplicates, discard them, with warning.
\*/
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
{
AttVal *charsetAttr;
AttVal *contentAttr;
AttVal *httpEquivAttr;
Bool charsetFound = no;
uint outenc = cfg(doc, TidyOutCharEncoding);
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
Node *currentNode;
Node *head = TY_(FindHEAD)( doc );
Node *metaTag;
Node *prevNode;
TidyBuffer buf;
TidyBuffer charsetString;
tmbstr httpEquivAttrValue;
tmbstr lcontent;
tmbstr newValue;
/* We can't do anything we don't have a head or encoding is NULL */
if( !head || !enc || !TY_(tmbstrlen)(enc))
return no;
if (outenc == RAW)
return no;
#ifndef NO_NATIVE_ISO2022_SUPPORT
if (outenc == ISO2022)
return no;
#endif
tidyBufInit(&charsetString);
/* Set up the content test 'charset=value' */
tidyBufClear(&charsetString);
tidyBufAppend(&charsetString, "charset=", 8);
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
/* process the children of the head */
for (currentNode = head->content; currentNode; currentNode = currentNode->next)
{
if (!nodeIsMETA(currentNode))
continue; /* not a meta node */
charsetAttr = attrGetCHARSET(currentNode);
httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
if(!charsetAttr && !httpEquivAttr)
continue; /* has no charset attribute */
/*
Meta charset comes in quite a few flavors:
1. <meta charset="value"> - expected for (X)HTML5.
*/
if (charsetAttr && !httpEquivAttr)
{
/* we already found one, so remove the rest. */
if(charsetFound || !charsetAttr->value)
{
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)( doc, currentNode );
currentNode = prevNode;
continue;
}
charsetFound = yes;
/* Fix mismatched attribute value */
if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
{
newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) + 1 ); /* allocate + 1 for 0 */
TY_(tmbstrcpy)( newValue, enc );
/* Note: previously http-equiv had been modified, without warning
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
*/
TidyDocFree(doc, charsetAttr->value); /* free current value */
charsetAttr->value = newValue;
}
/* Make sure it's the first element. */
if ( currentNode != head->content->next ){
TY_(RemoveNode)( currentNode );
TY_(InsertNodeAtStart)( head, currentNode );
}
continue;
}
/*
2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
expected for HTML4. This is normally ok - but can clash.
*/
if(httpEquivAttr && !charsetAttr)
{
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
if (!contentAttr)
continue; /* has no 'content' attribute */
if (!httpEquivAttr->value)
{
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
continue;
}
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
if(TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
continue; /* is not 'content-type' */
if (!contentAttr->value)
{
prevNode = currentNode->prev;
/* maybe need better message here */
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
continue;
}
/* check encoding matches
If a miss-match found here, fix it. previous silently done
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
*/
lcontent = TY_(tmbstrtolower)(contentAttr->value);
if (TY_(tmbsubstr)(lcontent, charsetString.bp))
{
/* we already found one, so remove the rest. */
if (charsetFound)
{
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
continue;
}
charsetFound = yes;
}
else
{
/* fix a mis-match */
if (charsetFound)
{
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
}
else
{
/* correct the content */
newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
TidyDocFree(doc, contentAttr->value);
TY_(tmbstrcpy)(newValue, "text/html; charset=");
TY_(tmbstrcpy)(newValue + 19, enc);
contentAttr->value = newValue;
charsetFound = yes;
}
}
continue;
}
/*
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
This is generally bad. Discard and warn.
*/
if(httpEquivAttr && charsetAttr)
{
/* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
}
}
/* completed head scan - add appropriate meta - if 'yes' and none exists */
if (cfgBool(doc, TidyMetaCharset) && !charsetFound)
{
/* add appropriate meta charset tag - no warning */
metaTag = TY_(InferredTag)(doc, TidyTag_META);
switch (TY_(HTMLVersion)(doc))
{
case HT50:
case XH50:
TY_(AddAttribute)(doc, metaTag, "charset", enc);
break;
default:
tidyBufInit(&buf);
tidyBufAppend(&buf, "text/html; ", 11);
tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp));
tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
tidyBufFree(&buf);
}
TY_(InsertNodeAtStart)(head, metaTag);
}
tidyBufFree(&charsetString);
return yes;
}
/* add meta element for Tidy */ /* add meta element for Tidy */
Bool TY_(AddGenerator)( TidyDocImpl* doc ) Bool TY_(AddGenerator)( TidyDocImpl* doc )
{ {