Issue #456 - Move new TidyMetaCharset to clean
This commit is contained in:
parent
6ebd12be67
commit
f310f1d5de
197
src/clean.c
197
src/clean.c
|
@ -2208,7 +2208,8 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Issue #456 - This is discarded */
|
/* Issue #456 - This is discarded
|
||||||
|
See replacement TidyMetaCharset */
|
||||||
#if 0
|
#if 0
|
||||||
void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
||||||
{
|
{
|
||||||
|
@ -2287,6 +2288,200 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*\
|
||||||
|
* Issue #456 - Check meta charset
|
||||||
|
* 1. if there is no meta charset, it adds one, according to doctype, no warning.
|
||||||
|
* 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required?
|
||||||
|
* 3. if it doesn't match the output encoding, and fix. Naybe no warning?
|
||||||
|
* 4. if there are duplicates, discard them, with warning.
|
||||||
|
\*/
|
||||||
|
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
|
||||||
|
{
|
||||||
|
AttVal *charsetAttr;
|
||||||
|
AttVal *contentAttr;
|
||||||
|
AttVal *httpEquivAttr;
|
||||||
|
Bool charsetFound = no;
|
||||||
|
uint outenc = cfg(doc, TidyOutCharEncoding);
|
||||||
|
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
|
||||||
|
Node *currentNode;
|
||||||
|
Node *head = TY_(FindHEAD)(doc);
|
||||||
|
Node *metaTag;
|
||||||
|
Node *prevNode;
|
||||||
|
TidyBuffer buf;
|
||||||
|
TidyBuffer charsetString;
|
||||||
|
tmbstr httpEquivAttrValue;
|
||||||
|
tmbstr lcontent;
|
||||||
|
tmbstr newValue;
|
||||||
|
/* We can't do anything we don't have a head or encoding is NULL */
|
||||||
|
if (!head || !enc || !TY_(tmbstrlen)(enc))
|
||||||
|
return no;
|
||||||
|
if (outenc == RAW)
|
||||||
|
return no;
|
||||||
|
#ifndef NO_NATIVE_ISO2022_SUPPORT
|
||||||
|
if (outenc == ISO2022)
|
||||||
|
return no;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
tidyBufInit(&charsetString);
|
||||||
|
/* Set up the content test 'charset=value' */
|
||||||
|
tidyBufClear(&charsetString);
|
||||||
|
tidyBufAppend(&charsetString, "charset=", 8);
|
||||||
|
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
|
||||||
|
tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
|
||||||
|
/* process the children of the head */
|
||||||
|
for (currentNode = head->content; currentNode; currentNode = currentNode->next)
|
||||||
|
{
|
||||||
|
if (!nodeIsMETA(currentNode))
|
||||||
|
continue; /* not a meta node */
|
||||||
|
charsetAttr = attrGetCHARSET(currentNode);
|
||||||
|
httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
|
||||||
|
if (!charsetAttr && !httpEquivAttr)
|
||||||
|
continue; /* has no charset attribute */
|
||||||
|
/*
|
||||||
|
Meta charset comes in quite a few flavors:
|
||||||
|
1. <meta charset="value"> - expected for (X)HTML5.
|
||||||
|
*/
|
||||||
|
if (charsetAttr && !httpEquivAttr)
|
||||||
|
{
|
||||||
|
/* we already found one, so remove the rest. */
|
||||||
|
if (charsetFound || !charsetAttr->value)
|
||||||
|
{
|
||||||
|
prevNode = currentNode->prev;
|
||||||
|
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||||
|
TY_(DiscardElement)(doc, currentNode);
|
||||||
|
currentNode = prevNode;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
charsetFound = yes;
|
||||||
|
/* Fix mismatched attribute value */
|
||||||
|
if (TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
|
||||||
|
{
|
||||||
|
newValue = (tmbstr)TidyDocAlloc(doc, TY_(tmbstrlen)(enc) + 1); /* allocate + 1 for 0 */
|
||||||
|
TY_(tmbstrcpy)(newValue, enc);
|
||||||
|
/* Note: previously http-equiv had been modified, without warning
|
||||||
|
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
||||||
|
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
|
||||||
|
*/
|
||||||
|
TidyDocFree(doc, charsetAttr->value); /* free current value */
|
||||||
|
charsetAttr->value = newValue;
|
||||||
|
}
|
||||||
|
/* Make sure it's the first element. */
|
||||||
|
if (currentNode != head->content->next) {
|
||||||
|
TY_(RemoveNode)(currentNode);
|
||||||
|
TY_(InsertNodeAtStart)(head, currentNode);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
|
||||||
|
expected for HTML4. This is normally ok - but can clash.
|
||||||
|
*/
|
||||||
|
if (httpEquivAttr && !charsetAttr)
|
||||||
|
{
|
||||||
|
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
|
||||||
|
if (!contentAttr)
|
||||||
|
continue; /* has no 'content' attribute */
|
||||||
|
if (!httpEquivAttr->value)
|
||||||
|
{
|
||||||
|
prevNode = currentNode->prev;
|
||||||
|
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||||
|
TY_(DiscardElement)(doc, currentNode);
|
||||||
|
currentNode = prevNode;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
|
||||||
|
if (TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
|
||||||
|
continue; /* is not 'content-type' */
|
||||||
|
if (!contentAttr->value)
|
||||||
|
{
|
||||||
|
prevNode = currentNode->prev;
|
||||||
|
/* maybe need better message here */
|
||||||
|
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||||
|
TY_(DiscardElement)(doc, currentNode);
|
||||||
|
currentNode = prevNode;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
/* check encoding matches
|
||||||
|
If a miss-match found here, fix it. previous silently done
|
||||||
|
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
||||||
|
*/
|
||||||
|
lcontent = TY_(tmbstrtolower)(contentAttr->value);
|
||||||
|
if (TY_(tmbsubstr)(lcontent, charsetString.bp))
|
||||||
|
{
|
||||||
|
/* we already found one, so remove the rest. */
|
||||||
|
if (charsetFound)
|
||||||
|
{
|
||||||
|
prevNode = currentNode->prev;
|
||||||
|
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||||
|
TY_(DiscardElement)(doc, currentNode);
|
||||||
|
currentNode = prevNode;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
charsetFound = yes;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* fix a mis-match */
|
||||||
|
if (charsetFound)
|
||||||
|
{
|
||||||
|
prevNode = currentNode->prev;
|
||||||
|
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||||
|
TY_(DiscardElement)(doc, currentNode);
|
||||||
|
currentNode = prevNode;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* correct the content */
|
||||||
|
newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
|
||||||
|
TidyDocFree(doc, contentAttr->value);
|
||||||
|
TY_(tmbstrcpy)(newValue, "text/html; charset=");
|
||||||
|
TY_(tmbstrcpy)(newValue + 19, enc);
|
||||||
|
contentAttr->value = newValue;
|
||||||
|
charsetFound = yes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
|
||||||
|
This is generally bad. Discard and warn.
|
||||||
|
*/
|
||||||
|
if (httpEquivAttr && charsetAttr)
|
||||||
|
{
|
||||||
|
/* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
|
||||||
|
prevNode = currentNode->prev;
|
||||||
|
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||||
|
TY_(DiscardElement)(doc, currentNode);
|
||||||
|
currentNode = prevNode;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* completed head scan - add appropriate meta - if 'yes' and none exists */
|
||||||
|
if (cfgBool(doc, TidyMetaCharset) && !charsetFound)
|
||||||
|
{
|
||||||
|
/* add appropriate meta charset tag - no warning */
|
||||||
|
metaTag = TY_(InferredTag)(doc, TidyTag_META);
|
||||||
|
switch (TY_(HTMLVersion)(doc))
|
||||||
|
{
|
||||||
|
case HT50:
|
||||||
|
case XH50:
|
||||||
|
TY_(AddAttribute)(doc, metaTag, "charset", enc);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
tidyBufInit(&buf);
|
||||||
|
tidyBufAppend(&buf, "text/html; ", 11);
|
||||||
|
tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp));
|
||||||
|
tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
|
||||||
|
TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
|
||||||
|
tidyBufFree(&buf);
|
||||||
|
}
|
||||||
|
TY_(InsertNodeAtStart)(head, metaTag);
|
||||||
|
}
|
||||||
|
tidyBufFree(&charsetString);
|
||||||
|
return yes;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void TY_(DropComments)(TidyDocImpl* doc, Node* node)
|
void TY_(DropComments)(TidyDocImpl* doc, Node* node)
|
||||||
{
|
{
|
||||||
Node* next;
|
Node* next;
|
||||||
|
|
|
@ -67,6 +67,7 @@ void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent );
|
||||||
#if 0
|
#if 0
|
||||||
void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
|
void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
|
||||||
#endif
|
#endif
|
||||||
|
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc);
|
||||||
|
|
||||||
void TY_(DropComments)(TidyDocImpl* doc, Node* node);
|
void TY_(DropComments)(TidyDocImpl* doc, Node* node);
|
||||||
void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);
|
void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);
|
||||||
|
|
193
src/lexer.c
193
src/lexer.c
|
@ -1827,199 +1827,6 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
|
||||||
return node;
|
return node;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*\
|
|
||||||
* Issue #456 - Check meta charset
|
|
||||||
* 1. if there is no meta charset, it adds one, according to doctype, no warning.
|
|
||||||
* 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required?
|
|
||||||
* 3. if it doesn't match the output encoding, and fix. Naybe no warning?
|
|
||||||
* 4. if there are duplicates, discard them, with warning.
|
|
||||||
\*/
|
|
||||||
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
|
|
||||||
{
|
|
||||||
AttVal *charsetAttr;
|
|
||||||
AttVal *contentAttr;
|
|
||||||
AttVal *httpEquivAttr;
|
|
||||||
Bool charsetFound = no;
|
|
||||||
uint outenc = cfg(doc, TidyOutCharEncoding);
|
|
||||||
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
|
|
||||||
Node *currentNode;
|
|
||||||
Node *head = TY_(FindHEAD)( doc );
|
|
||||||
Node *metaTag;
|
|
||||||
Node *prevNode;
|
|
||||||
TidyBuffer buf;
|
|
||||||
TidyBuffer charsetString;
|
|
||||||
tmbstr httpEquivAttrValue;
|
|
||||||
tmbstr lcontent;
|
|
||||||
tmbstr newValue;
|
|
||||||
/* We can't do anything we don't have a head or encoding is NULL */
|
|
||||||
if( !head || !enc || !TY_(tmbstrlen)(enc))
|
|
||||||
return no;
|
|
||||||
if (outenc == RAW)
|
|
||||||
return no;
|
|
||||||
#ifndef NO_NATIVE_ISO2022_SUPPORT
|
|
||||||
if (outenc == ISO2022)
|
|
||||||
return no;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
tidyBufInit(&charsetString);
|
|
||||||
/* Set up the content test 'charset=value' */
|
|
||||||
tidyBufClear(&charsetString);
|
|
||||||
tidyBufAppend(&charsetString, "charset=", 8);
|
|
||||||
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
|
|
||||||
tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
|
|
||||||
/* process the children of the head */
|
|
||||||
for (currentNode = head->content; currentNode; currentNode = currentNode->next)
|
|
||||||
{
|
|
||||||
if (!nodeIsMETA(currentNode))
|
|
||||||
continue; /* not a meta node */
|
|
||||||
charsetAttr = attrGetCHARSET(currentNode);
|
|
||||||
httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
|
|
||||||
if(!charsetAttr && !httpEquivAttr)
|
|
||||||
continue; /* has no charset attribute */
|
|
||||||
/*
|
|
||||||
Meta charset comes in quite a few flavors:
|
|
||||||
1. <meta charset="value"> - expected for (X)HTML5.
|
|
||||||
*/
|
|
||||||
if (charsetAttr && !httpEquivAttr)
|
|
||||||
{
|
|
||||||
/* we already found one, so remove the rest. */
|
|
||||||
if(charsetFound || !charsetAttr->value)
|
|
||||||
{
|
|
||||||
prevNode = currentNode->prev;
|
|
||||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
|
||||||
TY_(DiscardElement)( doc, currentNode );
|
|
||||||
currentNode = prevNode;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
charsetFound = yes;
|
|
||||||
/* Fix mismatched attribute value */
|
|
||||||
if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
|
|
||||||
{
|
|
||||||
newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) + 1 ); /* allocate + 1 for 0 */
|
|
||||||
TY_(tmbstrcpy)( newValue, enc );
|
|
||||||
/* Note: previously http-equiv had been modified, without warning
|
|
||||||
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
|
||||||
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
|
|
||||||
*/
|
|
||||||
TidyDocFree(doc, charsetAttr->value); /* free current value */
|
|
||||||
charsetAttr->value = newValue;
|
|
||||||
}
|
|
||||||
/* Make sure it's the first element. */
|
|
||||||
if ( currentNode != head->content->next ){
|
|
||||||
TY_(RemoveNode)( currentNode );
|
|
||||||
TY_(InsertNodeAtStart)( head, currentNode );
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
|
|
||||||
expected for HTML4. This is normally ok - but can clash.
|
|
||||||
*/
|
|
||||||
if(httpEquivAttr && !charsetAttr)
|
|
||||||
{
|
|
||||||
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
|
|
||||||
if (!contentAttr)
|
|
||||||
continue; /* has no 'content' attribute */
|
|
||||||
if (!httpEquivAttr->value)
|
|
||||||
{
|
|
||||||
prevNode = currentNode->prev;
|
|
||||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
|
||||||
TY_(DiscardElement)(doc, currentNode);
|
|
||||||
currentNode = prevNode;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
|
|
||||||
if(TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
|
|
||||||
continue; /* is not 'content-type' */
|
|
||||||
if (!contentAttr->value)
|
|
||||||
{
|
|
||||||
prevNode = currentNode->prev;
|
|
||||||
/* maybe need better message here */
|
|
||||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
|
||||||
TY_(DiscardElement)(doc, currentNode);
|
|
||||||
currentNode = prevNode;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
/* check encoding matches
|
|
||||||
If a miss-match found here, fix it. previous silently done
|
|
||||||
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
|
||||||
*/
|
|
||||||
lcontent = TY_(tmbstrtolower)(contentAttr->value);
|
|
||||||
if (TY_(tmbsubstr)(lcontent, charsetString.bp))
|
|
||||||
{
|
|
||||||
/* we already found one, so remove the rest. */
|
|
||||||
if (charsetFound)
|
|
||||||
{
|
|
||||||
prevNode = currentNode->prev;
|
|
||||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
|
||||||
TY_(DiscardElement)(doc, currentNode);
|
|
||||||
currentNode = prevNode;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
charsetFound = yes;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/* fix a mis-match */
|
|
||||||
if (charsetFound)
|
|
||||||
{
|
|
||||||
prevNode = currentNode->prev;
|
|
||||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
|
||||||
TY_(DiscardElement)(doc, currentNode);
|
|
||||||
currentNode = prevNode;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/* correct the content */
|
|
||||||
newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
|
|
||||||
TidyDocFree(doc, contentAttr->value);
|
|
||||||
TY_(tmbstrcpy)(newValue, "text/html; charset=");
|
|
||||||
TY_(tmbstrcpy)(newValue + 19, enc);
|
|
||||||
contentAttr->value = newValue;
|
|
||||||
charsetFound = yes;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
|
|
||||||
This is generally bad. Discard and warn.
|
|
||||||
*/
|
|
||||||
if(httpEquivAttr && charsetAttr)
|
|
||||||
{
|
|
||||||
/* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
|
|
||||||
prevNode = currentNode->prev;
|
|
||||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
|
||||||
TY_(DiscardElement)(doc, currentNode);
|
|
||||||
currentNode = prevNode;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* completed head scan - add appropriate meta - if 'yes' and none exists */
|
|
||||||
if (cfgBool(doc, TidyMetaCharset) && !charsetFound)
|
|
||||||
{
|
|
||||||
/* add appropriate meta charset tag - no warning */
|
|
||||||
metaTag = TY_(InferredTag)(doc, TidyTag_META);
|
|
||||||
switch (TY_(HTMLVersion)(doc))
|
|
||||||
{
|
|
||||||
case HT50:
|
|
||||||
case XH50:
|
|
||||||
TY_(AddAttribute)(doc, metaTag, "charset", enc);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
tidyBufInit(&buf);
|
|
||||||
tidyBufAppend(&buf, "text/html; ", 11);
|
|
||||||
tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp));
|
|
||||||
tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
|
|
||||||
TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
|
|
||||||
tidyBufFree(&buf);
|
|
||||||
}
|
|
||||||
TY_(InsertNodeAtStart)(head, metaTag);
|
|
||||||
}
|
|
||||||
tidyBufFree(&charsetString);
|
|
||||||
return yes;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* add meta element for Tidy */
|
/* add meta element for Tidy */
|
||||||
Bool TY_(AddGenerator)( TidyDocImpl* doc )
|
Bool TY_(AddGenerator)( TidyDocImpl* doc )
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in a new issue