Issue #456 - Move new TidyMetaCharset to clean
This commit is contained in:
parent
6ebd12be67
commit
f310f1d5de
197
src/clean.c
197
src/clean.c
|
@ -2208,7 +2208,8 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
|
|||
}
|
||||
#endif
|
||||
|
||||
/* Issue #456 - This is discarded */
|
||||
/* Issue #456 - This is discarded
|
||||
See replacement TidyMetaCharset */
|
||||
#if 0
|
||||
void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
||||
{
|
||||
|
@ -2287,6 +2288,200 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
|||
}
|
||||
#endif
|
||||
|
||||
/*\
|
||||
* Issue #456 - Check meta charset
|
||||
* 1. if there is no meta charset, it adds one, according to doctype, no warning.
|
||||
* 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required?
|
||||
* 3. if it doesn't match the output encoding, and fix. Naybe no warning?
|
||||
* 4. if there are duplicates, discard them, with warning.
|
||||
\*/
|
||||
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
|
||||
{
|
||||
AttVal *charsetAttr;
|
||||
AttVal *contentAttr;
|
||||
AttVal *httpEquivAttr;
|
||||
Bool charsetFound = no;
|
||||
uint outenc = cfg(doc, TidyOutCharEncoding);
|
||||
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
|
||||
Node *currentNode;
|
||||
Node *head = TY_(FindHEAD)(doc);
|
||||
Node *metaTag;
|
||||
Node *prevNode;
|
||||
TidyBuffer buf;
|
||||
TidyBuffer charsetString;
|
||||
tmbstr httpEquivAttrValue;
|
||||
tmbstr lcontent;
|
||||
tmbstr newValue;
|
||||
/* We can't do anything we don't have a head or encoding is NULL */
|
||||
if (!head || !enc || !TY_(tmbstrlen)(enc))
|
||||
return no;
|
||||
if (outenc == RAW)
|
||||
return no;
|
||||
#ifndef NO_NATIVE_ISO2022_SUPPORT
|
||||
if (outenc == ISO2022)
|
||||
return no;
|
||||
#endif
|
||||
|
||||
tidyBufInit(&charsetString);
|
||||
/* Set up the content test 'charset=value' */
|
||||
tidyBufClear(&charsetString);
|
||||
tidyBufAppend(&charsetString, "charset=", 8);
|
||||
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
|
||||
tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
|
||||
/* process the children of the head */
|
||||
for (currentNode = head->content; currentNode; currentNode = currentNode->next)
|
||||
{
|
||||
if (!nodeIsMETA(currentNode))
|
||||
continue; /* not a meta node */
|
||||
charsetAttr = attrGetCHARSET(currentNode);
|
||||
httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
|
||||
if (!charsetAttr && !httpEquivAttr)
|
||||
continue; /* has no charset attribute */
|
||||
/*
|
||||
Meta charset comes in quite a few flavors:
|
||||
1. <meta charset="value"> - expected for (X)HTML5.
|
||||
*/
|
||||
if (charsetAttr && !httpEquivAttr)
|
||||
{
|
||||
/* we already found one, so remove the rest. */
|
||||
if (charsetFound || !charsetAttr->value)
|
||||
{
|
||||
prevNode = currentNode->prev;
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)(doc, currentNode);
|
||||
currentNode = prevNode;
|
||||
continue;
|
||||
}
|
||||
charsetFound = yes;
|
||||
/* Fix mismatched attribute value */
|
||||
if (TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
|
||||
{
|
||||
newValue = (tmbstr)TidyDocAlloc(doc, TY_(tmbstrlen)(enc) + 1); /* allocate + 1 for 0 */
|
||||
TY_(tmbstrcpy)(newValue, enc);
|
||||
/* Note: previously http-equiv had been modified, without warning
|
||||
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
||||
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
|
||||
*/
|
||||
TidyDocFree(doc, charsetAttr->value); /* free current value */
|
||||
charsetAttr->value = newValue;
|
||||
}
|
||||
/* Make sure it's the first element. */
|
||||
if (currentNode != head->content->next) {
|
||||
TY_(RemoveNode)(currentNode);
|
||||
TY_(InsertNodeAtStart)(head, currentNode);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
|
||||
expected for HTML4. This is normally ok - but can clash.
|
||||
*/
|
||||
if (httpEquivAttr && !charsetAttr)
|
||||
{
|
||||
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
|
||||
if (!contentAttr)
|
||||
continue; /* has no 'content' attribute */
|
||||
if (!httpEquivAttr->value)
|
||||
{
|
||||
prevNode = currentNode->prev;
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)(doc, currentNode);
|
||||
currentNode = prevNode;
|
||||
continue;
|
||||
}
|
||||
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
|
||||
if (TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
|
||||
continue; /* is not 'content-type' */
|
||||
if (!contentAttr->value)
|
||||
{
|
||||
prevNode = currentNode->prev;
|
||||
/* maybe need better message here */
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)(doc, currentNode);
|
||||
currentNode = prevNode;
|
||||
continue;
|
||||
}
|
||||
/* check encoding matches
|
||||
If a miss-match found here, fix it. previous silently done
|
||||
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
||||
*/
|
||||
lcontent = TY_(tmbstrtolower)(contentAttr->value);
|
||||
if (TY_(tmbsubstr)(lcontent, charsetString.bp))
|
||||
{
|
||||
/* we already found one, so remove the rest. */
|
||||
if (charsetFound)
|
||||
{
|
||||
prevNode = currentNode->prev;
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)(doc, currentNode);
|
||||
currentNode = prevNode;
|
||||
continue;
|
||||
}
|
||||
charsetFound = yes;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* fix a mis-match */
|
||||
if (charsetFound)
|
||||
{
|
||||
prevNode = currentNode->prev;
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)(doc, currentNode);
|
||||
currentNode = prevNode;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* correct the content */
|
||||
newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
|
||||
TidyDocFree(doc, contentAttr->value);
|
||||
TY_(tmbstrcpy)(newValue, "text/html; charset=");
|
||||
TY_(tmbstrcpy)(newValue + 19, enc);
|
||||
contentAttr->value = newValue;
|
||||
charsetFound = yes;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
|
||||
This is generally bad. Discard and warn.
|
||||
*/
|
||||
if (httpEquivAttr && charsetAttr)
|
||||
{
|
||||
/* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
|
||||
prevNode = currentNode->prev;
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)(doc, currentNode);
|
||||
currentNode = prevNode;
|
||||
}
|
||||
}
|
||||
|
||||
/* completed head scan - add appropriate meta - if 'yes' and none exists */
|
||||
if (cfgBool(doc, TidyMetaCharset) && !charsetFound)
|
||||
{
|
||||
/* add appropriate meta charset tag - no warning */
|
||||
metaTag = TY_(InferredTag)(doc, TidyTag_META);
|
||||
switch (TY_(HTMLVersion)(doc))
|
||||
{
|
||||
case HT50:
|
||||
case XH50:
|
||||
TY_(AddAttribute)(doc, metaTag, "charset", enc);
|
||||
break;
|
||||
default:
|
||||
tidyBufInit(&buf);
|
||||
tidyBufAppend(&buf, "text/html; ", 11);
|
||||
tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp));
|
||||
tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
|
||||
TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
|
||||
tidyBufFree(&buf);
|
||||
}
|
||||
TY_(InsertNodeAtStart)(head, metaTag);
|
||||
}
|
||||
tidyBufFree(&charsetString);
|
||||
return yes;
|
||||
}
|
||||
|
||||
|
||||
void TY_(DropComments)(TidyDocImpl* doc, Node* node)
|
||||
{
|
||||
Node* next;
|
||||
|
|
|
@ -67,6 +67,7 @@ void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent );
|
|||
#if 0
|
||||
void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
|
||||
#endif
|
||||
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc);
|
||||
|
||||
void TY_(DropComments)(TidyDocImpl* doc, Node* node);
|
||||
void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);
|
||||
|
|
193
src/lexer.c
193
src/lexer.c
|
@ -1827,199 +1827,6 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
|
|||
return node;
|
||||
}
|
||||
|
||||
/*\
|
||||
* Issue #456 - Check meta charset
|
||||
* 1. if there is no meta charset, it adds one, according to doctype, no warning.
|
||||
* 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required?
|
||||
* 3. if it doesn't match the output encoding, and fix. Naybe no warning?
|
||||
* 4. if there are duplicates, discard them, with warning.
|
||||
\*/
|
||||
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
|
||||
{
|
||||
AttVal *charsetAttr;
|
||||
AttVal *contentAttr;
|
||||
AttVal *httpEquivAttr;
|
||||
Bool charsetFound = no;
|
||||
uint outenc = cfg(doc, TidyOutCharEncoding);
|
||||
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
|
||||
Node *currentNode;
|
||||
Node *head = TY_(FindHEAD)( doc );
|
||||
Node *metaTag;
|
||||
Node *prevNode;
|
||||
TidyBuffer buf;
|
||||
TidyBuffer charsetString;
|
||||
tmbstr httpEquivAttrValue;
|
||||
tmbstr lcontent;
|
||||
tmbstr newValue;
|
||||
/* We can't do anything we don't have a head or encoding is NULL */
|
||||
if( !head || !enc || !TY_(tmbstrlen)(enc))
|
||||
return no;
|
||||
if (outenc == RAW)
|
||||
return no;
|
||||
#ifndef NO_NATIVE_ISO2022_SUPPORT
|
||||
if (outenc == ISO2022)
|
||||
return no;
|
||||
#endif
|
||||
|
||||
tidyBufInit(&charsetString);
|
||||
/* Set up the content test 'charset=value' */
|
||||
tidyBufClear(&charsetString);
|
||||
tidyBufAppend(&charsetString, "charset=", 8);
|
||||
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
|
||||
tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
|
||||
/* process the children of the head */
|
||||
for (currentNode = head->content; currentNode; currentNode = currentNode->next)
|
||||
{
|
||||
if (!nodeIsMETA(currentNode))
|
||||
continue; /* not a meta node */
|
||||
charsetAttr = attrGetCHARSET(currentNode);
|
||||
httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
|
||||
if(!charsetAttr && !httpEquivAttr)
|
||||
continue; /* has no charset attribute */
|
||||
/*
|
||||
Meta charset comes in quite a few flavors:
|
||||
1. <meta charset="value"> - expected for (X)HTML5.
|
||||
*/
|
||||
if (charsetAttr && !httpEquivAttr)
|
||||
{
|
||||
/* we already found one, so remove the rest. */
|
||||
if(charsetFound || !charsetAttr->value)
|
||||
{
|
||||
prevNode = currentNode->prev;
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)( doc, currentNode );
|
||||
currentNode = prevNode;
|
||||
continue;
|
||||
}
|
||||
charsetFound = yes;
|
||||
/* Fix mismatched attribute value */
|
||||
if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
|
||||
{
|
||||
newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) + 1 ); /* allocate + 1 for 0 */
|
||||
TY_(tmbstrcpy)( newValue, enc );
|
||||
/* Note: previously http-equiv had been modified, without warning
|
||||
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
||||
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
|
||||
*/
|
||||
TidyDocFree(doc, charsetAttr->value); /* free current value */
|
||||
charsetAttr->value = newValue;
|
||||
}
|
||||
/* Make sure it's the first element. */
|
||||
if ( currentNode != head->content->next ){
|
||||
TY_(RemoveNode)( currentNode );
|
||||
TY_(InsertNodeAtStart)( head, currentNode );
|
||||
}
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
|
||||
expected for HTML4. This is normally ok - but can clash.
|
||||
*/
|
||||
if(httpEquivAttr && !charsetAttr)
|
||||
{
|
||||
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
|
||||
if (!contentAttr)
|
||||
continue; /* has no 'content' attribute */
|
||||
if (!httpEquivAttr->value)
|
||||
{
|
||||
prevNode = currentNode->prev;
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)(doc, currentNode);
|
||||
currentNode = prevNode;
|
||||
continue;
|
||||
}
|
||||
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
|
||||
if(TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
|
||||
continue; /* is not 'content-type' */
|
||||
if (!contentAttr->value)
|
||||
{
|
||||
prevNode = currentNode->prev;
|
||||
/* maybe need better message here */
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)(doc, currentNode);
|
||||
currentNode = prevNode;
|
||||
continue;
|
||||
}
|
||||
/* check encoding matches
|
||||
If a miss-match found here, fix it. previous silently done
|
||||
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
||||
*/
|
||||
lcontent = TY_(tmbstrtolower)(contentAttr->value);
|
||||
if (TY_(tmbsubstr)(lcontent, charsetString.bp))
|
||||
{
|
||||
/* we already found one, so remove the rest. */
|
||||
if (charsetFound)
|
||||
{
|
||||
prevNode = currentNode->prev;
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)(doc, currentNode);
|
||||
currentNode = prevNode;
|
||||
continue;
|
||||
}
|
||||
charsetFound = yes;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* fix a mis-match */
|
||||
if (charsetFound)
|
||||
{
|
||||
prevNode = currentNode->prev;
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)(doc, currentNode);
|
||||
currentNode = prevNode;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* correct the content */
|
||||
newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
|
||||
TidyDocFree(doc, contentAttr->value);
|
||||
TY_(tmbstrcpy)(newValue, "text/html; charset=");
|
||||
TY_(tmbstrcpy)(newValue + 19, enc);
|
||||
contentAttr->value = newValue;
|
||||
charsetFound = yes;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
|
||||
This is generally bad. Discard and warn.
|
||||
*/
|
||||
if(httpEquivAttr && charsetAttr)
|
||||
{
|
||||
/* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
|
||||
prevNode = currentNode->prev;
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)(doc, currentNode);
|
||||
currentNode = prevNode;
|
||||
}
|
||||
}
|
||||
|
||||
/* completed head scan - add appropriate meta - if 'yes' and none exists */
|
||||
if (cfgBool(doc, TidyMetaCharset) && !charsetFound)
|
||||
{
|
||||
/* add appropriate meta charset tag - no warning */
|
||||
metaTag = TY_(InferredTag)(doc, TidyTag_META);
|
||||
switch (TY_(HTMLVersion)(doc))
|
||||
{
|
||||
case HT50:
|
||||
case XH50:
|
||||
TY_(AddAttribute)(doc, metaTag, "charset", enc);
|
||||
break;
|
||||
default:
|
||||
tidyBufInit(&buf);
|
||||
tidyBufAppend(&buf, "text/html; ", 11);
|
||||
tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp));
|
||||
tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
|
||||
TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
|
||||
tidyBufFree(&buf);
|
||||
}
|
||||
TY_(InsertNodeAtStart)(head, metaTag);
|
||||
}
|
||||
tidyBufFree(&charsetString);
|
||||
return yes;
|
||||
}
|
||||
|
||||
/* add meta element for Tidy */
|
||||
Bool TY_(AddGenerator)( TidyDocImpl* doc )
|
||||
{
|
||||
|
|
Loading…
Reference in a new issue