Issue #456 - More work on this option
This commit is contained in:
parent
8843199370
commit
6ebd12be67
|
@ -2208,6 +2208,8 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
|
|||
}
|
||||
#endif
|
||||
|
||||
/* Issue #456 - This is discarded */
|
||||
#if 0
|
||||
void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
||||
{
|
||||
Node *pNode;
|
||||
|
@ -2283,6 +2285,7 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
|||
pLastProp = NULL;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void TY_(DropComments)(TidyDocImpl* doc, Node* node)
|
||||
{
|
||||
|
|
|
@ -63,8 +63,10 @@ void TY_(BumpObject)( TidyDocImpl* doc, Node *html );
|
|||
#if 0
|
||||
void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent );
|
||||
#endif
|
||||
|
||||
/* Issue #456 - This is discarded */
|
||||
#if 0
|
||||
void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
|
||||
#endif
|
||||
|
||||
void TY_(DropComments)(TidyDocImpl* doc, Node* node);
|
||||
void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);
|
||||
|
|
160
src/lexer.c
160
src/lexer.c
|
@ -1827,19 +1827,21 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
|
|||
return node;
|
||||
}
|
||||
|
||||
/* Check meta charset
|
||||
1. if there is no meta charset, it adds one.
|
||||
2. if there is a meta charset, it moves it to the top if HEAD.
|
||||
3. if it doesn't match the output encoding, warn about that.
|
||||
4. if there are duplicates, discard them.
|
||||
*/
|
||||
/*\
|
||||
* Issue #456 - Check meta charset
|
||||
* 1. if there is no meta charset, it adds one, according to doctype, no warning.
|
||||
* 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required?
|
||||
* 3. if it doesn't match the output encoding, and fix. Naybe no warning?
|
||||
* 4. if there are duplicates, discard them, with warning.
|
||||
\*/
|
||||
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
|
||||
{
|
||||
AttVal *charsetAttr;
|
||||
AttVal *contentAttr;
|
||||
AttVal *httpEquivAttr;
|
||||
Bool charsetFound = no;
|
||||
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
|
||||
uint outenc = cfg(doc, TidyOutCharEncoding);
|
||||
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
|
||||
Node *currentNode;
|
||||
Node *head = TY_(FindHEAD)( doc );
|
||||
Node *metaTag;
|
||||
|
@ -1850,25 +1852,38 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
|
|||
tmbstr lcontent;
|
||||
tmbstr newValue;
|
||||
/* We can't do anything we don't have a head or encoding is NULL */
|
||||
if( !head || !enc )
|
||||
if( !head || !enc || !TY_(tmbstrlen)(enc))
|
||||
return no;
|
||||
if (outenc == RAW)
|
||||
return no;
|
||||
#ifndef NO_NATIVE_ISO2022_SUPPORT
|
||||
if (outenc == ISO2022)
|
||||
return no;
|
||||
#endif
|
||||
|
||||
tidyBufInit(&charsetString);
|
||||
/* Set up the content test 'charset=value' */
|
||||
tidyBufClear(&charsetString);
|
||||
tidyBufAppend(&charsetString, "charset=", 8);
|
||||
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
|
||||
tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
|
||||
/* process the children of the head */
|
||||
for (currentNode = head->content; currentNode; currentNode = currentNode->next)
|
||||
{
|
||||
if (!nodeIsMETA(currentNode))
|
||||
continue;
|
||||
continue; /* not a meta node */
|
||||
charsetAttr = attrGetCHARSET(currentNode);
|
||||
httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
|
||||
if(!charsetAttr && !httpEquivAttr)
|
||||
continue;
|
||||
continue; /* has no charset attribute */
|
||||
/*
|
||||
Meta charset comes in quite a few flavors:
|
||||
1. <meta charset=value> - expected for (X)HTML5.
|
||||
1. <meta charset="value"> - expected for (X)HTML5.
|
||||
*/
|
||||
if (charsetAttr && !httpEquivAttr)
|
||||
{
|
||||
// we already found one, so remove the rest.
|
||||
if(charsetFound)
|
||||
/* we already found one, so remove the rest. */
|
||||
if(charsetFound || !charsetAttr->value)
|
||||
{
|
||||
prevNode = currentNode->prev;
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
|
@ -1877,15 +1892,19 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
|
|||
continue;
|
||||
}
|
||||
charsetFound = yes;
|
||||
// Fix mismatched attribute value
|
||||
/* Fix mismatched attribute value */
|
||||
if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
|
||||
{
|
||||
newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) );
|
||||
newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) + 1 ); /* allocate + 1 for 0 */
|
||||
TY_(tmbstrcpy)( newValue, enc );
|
||||
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
|
||||
/* Note: previously http-equiv had been modified, without warning
|
||||
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
||||
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
|
||||
*/
|
||||
TidyDocFree(doc, charsetAttr->value); /* free current value */
|
||||
charsetAttr->value = newValue;
|
||||
}
|
||||
// Make sure it's the first element.
|
||||
/* Make sure it's the first element. */
|
||||
if ( currentNode != head->content->next ){
|
||||
TY_(RemoveNode)( currentNode );
|
||||
TY_(InsertNodeAtStart)( head, currentNode );
|
||||
|
@ -1893,51 +1912,110 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
|
|||
continue;
|
||||
}
|
||||
/*
|
||||
2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
|
||||
expected for HTML4. This is normally ok - but can clash.
|
||||
2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
|
||||
expected for HTML4. This is normally ok - but can clash.
|
||||
*/
|
||||
if(httpEquivAttr && !charsetAttr)
|
||||
{
|
||||
tidyBufClear(&charsetString);
|
||||
tidyBufAppend(&charsetString, "charset=", 8);
|
||||
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)( enc ));
|
||||
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
|
||||
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
|
||||
|
||||
if(!contentAttr || TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
|
||||
if (!contentAttr)
|
||||
continue; /* has no 'content' attribute */
|
||||
if (!httpEquivAttr->value)
|
||||
{
|
||||
prevNode = currentNode->prev;
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)(doc, currentNode);
|
||||
currentNode = prevNode;
|
||||
continue;
|
||||
lcontent = TY_(tmbstrtolower)(contentAttr->value);
|
||||
if(TY_(tmbsubstr)(lcontent, (ctmbstr) &charsetString)){
|
||||
printf("WARN ABOUT CLASH: %s \n", contentAttr->value);
|
||||
}
|
||||
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
|
||||
if(TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
|
||||
continue; /* is not 'content-type' */
|
||||
if (!contentAttr->value)
|
||||
{
|
||||
prevNode = currentNode->prev;
|
||||
/* maybe need better message here */
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)(doc, currentNode);
|
||||
currentNode = prevNode;
|
||||
continue;
|
||||
}
|
||||
/* check encoding matches
|
||||
If a miss-match found here, fix it. previous silently done
|
||||
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
|
||||
*/
|
||||
lcontent = TY_(tmbstrtolower)(contentAttr->value);
|
||||
if (TY_(tmbsubstr)(lcontent, charsetString.bp))
|
||||
{
|
||||
/* we already found one, so remove the rest. */
|
||||
if (charsetFound)
|
||||
{
|
||||
prevNode = currentNode->prev;
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)(doc, currentNode);
|
||||
currentNode = prevNode;
|
||||
continue;
|
||||
}
|
||||
charsetFound = yes;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* fix a mis-match */
|
||||
if (charsetFound)
|
||||
{
|
||||
prevNode = currentNode->prev;
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)(doc, currentNode);
|
||||
currentNode = prevNode;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* correct the content */
|
||||
newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
|
||||
TidyDocFree(doc, contentAttr->value);
|
||||
TY_(tmbstrcpy)(newValue, "text/html; charset=");
|
||||
TY_(tmbstrcpy)(newValue + 19, enc);
|
||||
contentAttr->value = newValue;
|
||||
charsetFound = yes;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
|
||||
This is generally bad.
|
||||
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
|
||||
This is generally bad. Discard and warn.
|
||||
*/
|
||||
if(httpEquivAttr && charsetAttr)
|
||||
{
|
||||
printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n");
|
||||
/* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
|
||||
prevNode = currentNode->prev;
|
||||
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
|
||||
TY_(DiscardElement)(doc, currentNode);
|
||||
currentNode = prevNode;
|
||||
}
|
||||
}
|
||||
if(charsetFound){
|
||||
return yes;
|
||||
}
|
||||
metaTag = TY_(InferredTag)(doc, TidyTag_META);
|
||||
switch(TY_(HTMLVersion)(doc))
|
||||
|
||||
/* completed head scan - add appropriate meta - if 'yes' and none exists */
|
||||
if (cfgBool(doc, TidyMetaCharset) && !charsetFound)
|
||||
{
|
||||
/* add appropriate meta charset tag - no warning */
|
||||
metaTag = TY_(InferredTag)(doc, TidyTag_META);
|
||||
switch (TY_(HTMLVersion)(doc))
|
||||
{
|
||||
case HT50:
|
||||
case XH50:
|
||||
TY_(AddAttribute)( doc, metaTag, "charset", enc);
|
||||
TY_(AddAttribute)(doc, metaTag, "charset", enc);
|
||||
break;
|
||||
default:
|
||||
tidyBufInit(&buf);
|
||||
tidyBufAppend(&buf, "text/html; charset=", 19);
|
||||
tidyBufAppend(&buf, (char*)enc, TY_(tmbstrlen)(enc));
|
||||
TY_(AddAttribute)( doc, metaTag, "content", (char*)buf.bp);
|
||||
tidyBufAppend(&buf, "text/html; ", 11);
|
||||
tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp));
|
||||
tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
|
||||
TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
|
||||
tidyBufFree(&buf);
|
||||
}
|
||||
TY_(InsertNodeAtStart)(head, metaTag);
|
||||
}
|
||||
TY_(InsertNodeAtStart)( head, metaTag );
|
||||
tidyBufFree(&charsetString);
|
||||
return yes;
|
||||
}
|
||||
|
|
|
@ -1992,7 +1992,6 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
|
|||
Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut );
|
||||
Bool xmlDecl = cfgBool( doc, TidyXmlDecl );
|
||||
Bool tidyMark = cfgBool( doc, TidyMark );
|
||||
Bool tidyMetaCharset = cfgBool( doc, TidyMetaCharset);
|
||||
Bool tidyXmlTags = cfgBool( doc, TidyXmlTags );
|
||||
Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName );
|
||||
Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis );
|
||||
|
@ -2044,12 +2043,7 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
|
|||
#endif
|
||||
|
||||
/* Reconcile http-equiv meta element with output encoding */
|
||||
if (cfg( doc, TidyOutCharEncoding) != RAW
|
||||
#ifndef NO_NATIVE_ISO2022_SUPPORT
|
||||
&& cfg( doc, TidyOutCharEncoding) != ISO2022
|
||||
#endif
|
||||
)
|
||||
TY_(VerifyHTTPEquiv)( doc, TY_(FindHEAD)( doc ));
|
||||
TY_(TidyMetaCharset)(doc);
|
||||
|
||||
if ( !TY_(CheckNodeIntegrity)( &doc->root ) )
|
||||
TidyPanic( doc->allocator, integrity );
|
||||
|
@ -2097,8 +2091,6 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
|
|||
if (tidyMark )
|
||||
TY_(AddGenerator)(doc);
|
||||
|
||||
if (tidyMetaCharset)
|
||||
TY_(TidyMetaCharset)(doc);
|
||||
}
|
||||
|
||||
/* ensure presence of initial <?xml version="1.0"?> */
|
||||
|
|
Loading…
Reference in a new issue