Issue #456 - More work on this option
This commit is contained in:
parent
8843199370
commit
6ebd12be67
|
@ -2208,6 +2208,8 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* Issue #456 - This is discarded */
|
||||||
|
#if 0
|
||||||
void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
||||||
{
|
{
|
||||||
Node *pNode;
|
Node *pNode;
|
||||||
|
@ -2283,6 +2285,7 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
||||||
pLastProp = NULL;
|
pLastProp = NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
void TY_(DropComments)(TidyDocImpl* doc, Node* node)
|
void TY_(DropComments)(TidyDocImpl* doc, Node* node)
|
||||||
{
|
{
|
||||||
|
|
|
@ -63,8 +63,10 @@ void TY_(BumpObject)( TidyDocImpl* doc, Node *html );
|
||||||
#if 0
|
#if 0
|
||||||
void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent );
|
void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent );
|
||||||
#endif
|
#endif
|
||||||
|
/* Issue #456 - This is discarded */
|
||||||
|
#if 0
|
||||||
void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
|
void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
|
||||||
|
#endif
|
||||||
|
|
||||||
void TY_(DropComments)(TidyDocImpl* doc, Node* node);
|
void TY_(DropComments)(TidyDocImpl* doc, Node* node);
|
||||||
void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);
|
void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);
|
||||||
|
|
142
src/lexer.c
142
src/lexer.c
|
@ -1827,19 +1827,21 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
|
||||||
return node;
|
return node;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Check meta charset
|
/*\
|
||||||
1. if there is no meta charset, it adds one.
|
* Issue #456 - Check meta charset
|
||||||
2. if there is a meta charset, it moves it to the top if HEAD.
|
* 1. if there is no meta charset, it adds one, according to doctype, no warning.
|
||||||
3. if it doesn't match the output encoding, warn about that.
|
* 2. if there is a meta charset, it moves it to the top of HEAD. Not sure this is required?
|
||||||
4. if there are duplicates, discard them.
|
* 3. if it doesn't match the output encoding, fix it. Maybe no warning?
|
||||||
*/
|
* 4. if there are duplicates, discard them, with warning.
|
||||||
|
\*/
|
||||||
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
|
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
|
||||||
{
|
{
|
||||||
AttVal *charsetAttr;
|
AttVal *charsetAttr;
|
||||||
AttVal *contentAttr;
|
AttVal *contentAttr;
|
||||||
AttVal *httpEquivAttr;
|
AttVal *httpEquivAttr;
|
||||||
Bool charsetFound = no;
|
Bool charsetFound = no;
|
||||||
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
|
uint outenc = cfg(doc, TidyOutCharEncoding);
|
||||||
|
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
|
||||||
Node *currentNode;
|
Node *currentNode;
|
||||||
Node *head = TY_(FindHEAD)( doc );
|
Node *head = TY_(FindHEAD)( doc );
|
||||||
Node *metaTag;
|
Node *metaTag;
|
||||||
|
@ -1850,25 +1852,38 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
|
||||||
tmbstr lcontent;
|
tmbstr lcontent;
|
||||||
tmbstr newValue;
|
tmbstr newValue;
|
||||||
/* We can't do anything we don't have a head or encoding is NULL */
|
/* We can't do anything we don't have a head or encoding is NULL */
|
||||||
if( !head || !enc )
|
if( !head || !enc || !TY_(tmbstrlen)(enc))
|
||||||
return no;
|
return no;
|
||||||
|
if (outenc == RAW)
|
||||||
|
return no;
|
||||||
|
#ifndef NO_NATIVE_ISO2022_SUPPORT
|
||||||
|
if (outenc == ISO2022)
|
||||||
|
return no;
|
||||||
|
#endif
|
||||||
|
|
||||||
tidyBufInit(&charsetString);
|
tidyBufInit(&charsetString);
|
||||||
|
/* Set up the content test 'charset=value' */
|
||||||
|
tidyBufClear(&charsetString);
|
||||||
|
tidyBufAppend(&charsetString, "charset=", 8);
|
||||||
|
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
|
||||||
|
tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
|
||||||
|
/* process the children of the head */
|
||||||
for (currentNode = head->content; currentNode; currentNode = currentNode->next)
|
for (currentNode = head->content; currentNode; currentNode = currentNode->next)
|
||||||
{
|
{
|
||||||
if (!nodeIsMETA(currentNode))
|
if (!nodeIsMETA(currentNode))
|
||||||
continue;
|
continue; /* not a meta node */
|
||||||
charsetAttr = attrGetCHARSET(currentNode);
|
charsetAttr = attrGetCHARSET(currentNode);
|
||||||
httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
|
httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
|
||||||
if(!charsetAttr && !httpEquivAttr)
|
if(!charsetAttr && !httpEquivAttr)
|
||||||
continue;
|
continue; /* has no charset attribute */
|
||||||
/*
|
/*
|
||||||
Meta charset comes in quite a few flavors:
|
Meta charset comes in quite a few flavors:
|
||||||
1. <meta charset=value> - expected for (X)HTML5.
|
1. <meta charset="value"> - expected for (X)HTML5.
|
||||||
*/
|
*/
|
||||||
if (charsetAttr && !httpEquivAttr)
|
if (charsetAttr && !httpEquivAttr)
|
||||||
{
|
{
|
||||||
// we already found one, so remove the rest.
|
/* we already found one, so remove the rest. */
|
||||||
if(charsetFound)
|
if(charsetFound || !charsetAttr->value)
|
||||||
{
|
{
|
||||||
prevNode = currentNode->prev;
|
prevNode = currentNode->prev;
|
||||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||||
|
@ -1877,15 +1892,19 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
charsetFound = yes;
|
charsetFound = yes;
|
||||||
// Fix mismatched attribute value
|
/* Fix mismatched attribute value */
|
||||||
if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
|
if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
|
||||||
{
|
{
|
||||||
newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) );
|
newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) + 1 ); /* allocate + 1 for 0 */
|
||||||
TY_(tmbstrcpy)( newValue, enc );
|
TY_(tmbstrcpy)( newValue, enc );
|
||||||
|
/* Note: previously http-equiv had been modified, without warning
|
||||||
|
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
||||||
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
|
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
|
||||||
|
*/
|
||||||
|
TidyDocFree(doc, charsetAttr->value); /* free current value */
|
||||||
charsetAttr->value = newValue;
|
charsetAttr->value = newValue;
|
||||||
}
|
}
|
||||||
// Make sure it's the first element.
|
/* Make sure it's the first element. */
|
||||||
if ( currentNode != head->content->next ){
|
if ( currentNode != head->content->next ){
|
||||||
TY_(RemoveNode)( currentNode );
|
TY_(RemoveNode)( currentNode );
|
||||||
TY_(InsertNodeAtStart)( head, currentNode );
|
TY_(InsertNodeAtStart)( head, currentNode );
|
||||||
|
@ -1898,31 +1917,88 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
|
||||||
*/
|
*/
|
||||||
if(httpEquivAttr && !charsetAttr)
|
if(httpEquivAttr && !charsetAttr)
|
||||||
{
|
{
|
||||||
tidyBufClear(&charsetString);
|
|
||||||
tidyBufAppend(&charsetString, "charset=", 8);
|
|
||||||
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)( enc ));
|
|
||||||
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
|
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
|
||||||
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
|
if (!contentAttr)
|
||||||
|
continue; /* has no 'content' attribute */
|
||||||
if(!contentAttr || TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
|
if (!httpEquivAttr->value)
|
||||||
|
{
|
||||||
|
prevNode = currentNode->prev;
|
||||||
|
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||||
|
TY_(DiscardElement)(doc, currentNode);
|
||||||
|
currentNode = prevNode;
|
||||||
continue;
|
continue;
|
||||||
lcontent = TY_(tmbstrtolower)(contentAttr->value);
|
|
||||||
if(TY_(tmbsubstr)(lcontent, (ctmbstr) &charsetString)){
|
|
||||||
printf("WARN ABOUT CLASH: %s \n", contentAttr->value);
|
|
||||||
}
|
}
|
||||||
|
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
|
||||||
|
if(TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
|
||||||
|
continue; /* is not 'content-type' */
|
||||||
|
if (!contentAttr->value)
|
||||||
|
{
|
||||||
|
prevNode = currentNode->prev;
|
||||||
|
/* maybe need better message here */
|
||||||
|
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||||
|
TY_(DiscardElement)(doc, currentNode);
|
||||||
|
currentNode = prevNode;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
/* check encoding matches
|
||||||
|
If a mismatch is found here, fix it. Previously this was done silently
|
||||||
|
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
||||||
|
*/
|
||||||
|
lcontent = TY_(tmbstrtolower)(contentAttr->value);
|
||||||
|
if (TY_(tmbsubstr)(lcontent, charsetString.bp))
|
||||||
|
{
|
||||||
|
/* we already found one, so remove the rest. */
|
||||||
|
if (charsetFound)
|
||||||
|
{
|
||||||
|
prevNode = currentNode->prev;
|
||||||
|
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||||
|
TY_(DiscardElement)(doc, currentNode);
|
||||||
|
currentNode = prevNode;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
charsetFound = yes;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* fix a mis-match */
|
||||||
|
if (charsetFound)
|
||||||
|
{
|
||||||
|
prevNode = currentNode->prev;
|
||||||
|
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||||
|
TY_(DiscardElement)(doc, currentNode);
|
||||||
|
currentNode = prevNode;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* correct the content */
|
||||||
|
newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
|
||||||
|
TidyDocFree(doc, contentAttr->value);
|
||||||
|
TY_(tmbstrcpy)(newValue, "text/html; charset=");
|
||||||
|
TY_(tmbstrcpy)(newValue + 19, enc);
|
||||||
|
contentAttr->value = newValue;
|
||||||
|
charsetFound = yes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
|
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
|
||||||
This is generally bad.
|
This is generally bad. Discard and warn.
|
||||||
*/
|
*/
|
||||||
if(httpEquivAttr && charsetAttr)
|
if(httpEquivAttr && charsetAttr)
|
||||||
{
|
{
|
||||||
printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n");
|
/* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
|
||||||
|
prevNode = currentNode->prev;
|
||||||
|
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||||
|
TY_(DiscardElement)(doc, currentNode);
|
||||||
|
currentNode = prevNode;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(charsetFound){
|
|
||||||
return yes;
|
/* completed head scan - add appropriate meta - if 'yes' and none exists */
|
||||||
}
|
if (cfgBool(doc, TidyMetaCharset) && !charsetFound)
|
||||||
|
{
|
||||||
|
/* add appropriate meta charset tag - no warning */
|
||||||
metaTag = TY_(InferredTag)(doc, TidyTag_META);
|
metaTag = TY_(InferredTag)(doc, TidyTag_META);
|
||||||
switch (TY_(HTMLVersion)(doc))
|
switch (TY_(HTMLVersion)(doc))
|
||||||
{
|
{
|
||||||
|
@ -1932,12 +2008,14 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
tidyBufInit(&buf);
|
tidyBufInit(&buf);
|
||||||
tidyBufAppend(&buf, "text/html; charset=", 19);
|
tidyBufAppend(&buf, "text/html; ", 11);
|
||||||
tidyBufAppend(&buf, (char*)enc, TY_(tmbstrlen)(enc));
|
tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp));
|
||||||
|
tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
|
||||||
TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
|
TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
|
||||||
tidyBufFree(&buf);
|
tidyBufFree(&buf);
|
||||||
}
|
}
|
||||||
TY_(InsertNodeAtStart)(head, metaTag);
|
TY_(InsertNodeAtStart)(head, metaTag);
|
||||||
|
}
|
||||||
tidyBufFree(&charsetString);
|
tidyBufFree(&charsetString);
|
||||||
return yes;
|
return yes;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1992,7 +1992,6 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
|
||||||
Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut );
|
Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut );
|
||||||
Bool xmlDecl = cfgBool( doc, TidyXmlDecl );
|
Bool xmlDecl = cfgBool( doc, TidyXmlDecl );
|
||||||
Bool tidyMark = cfgBool( doc, TidyMark );
|
Bool tidyMark = cfgBool( doc, TidyMark );
|
||||||
Bool tidyMetaCharset = cfgBool( doc, TidyMetaCharset);
|
|
||||||
Bool tidyXmlTags = cfgBool( doc, TidyXmlTags );
|
Bool tidyXmlTags = cfgBool( doc, TidyXmlTags );
|
||||||
Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName );
|
Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName );
|
||||||
Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis );
|
Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis );
|
||||||
|
@ -2044,12 +2043,7 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Reconcile http-equiv meta element with output encoding */
|
/* Reconcile http-equiv meta element with output encoding */
|
||||||
if (cfg( doc, TidyOutCharEncoding) != RAW
|
TY_(TidyMetaCharset)(doc);
|
||||||
#ifndef NO_NATIVE_ISO2022_SUPPORT
|
|
||||||
&& cfg( doc, TidyOutCharEncoding) != ISO2022
|
|
||||||
#endif
|
|
||||||
)
|
|
||||||
TY_(VerifyHTTPEquiv)( doc, TY_(FindHEAD)( doc ));
|
|
||||||
|
|
||||||
if ( !TY_(CheckNodeIntegrity)( &doc->root ) )
|
if ( !TY_(CheckNodeIntegrity)( &doc->root ) )
|
||||||
TidyPanic( doc->allocator, integrity );
|
TidyPanic( doc->allocator, integrity );
|
||||||
|
@ -2097,8 +2091,6 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
|
||||||
if (tidyMark )
|
if (tidyMark )
|
||||||
TY_(AddGenerator)(doc);
|
TY_(AddGenerator)(doc);
|
||||||
|
|
||||||
if (tidyMetaCharset)
|
|
||||||
TY_(TidyMetaCharset)(doc);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ensure presence of initial <?xml version="1.0"?> */
|
/* ensure presence of initial <?xml version="1.0"?> */
|
||||||
|
|
Loading…
Reference in a new issue