diff --git a/src/clean.c b/src/clean.c
index 779ddec..8db77c7 100644
--- a/src/clean.c
+++ b/src/clean.c
@@ -2208,6 +2208,8 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
}
#endif
+/* Issue #456 - This is discarded */
+#if 0
void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
{
Node *pNode;
@@ -2283,6 +2285,7 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
pLastProp = NULL;
}
}
+#endif
void TY_(DropComments)(TidyDocImpl* doc, Node* node)
{
diff --git a/src/clean.h b/src/clean.h
index 00d4923..d5d4117 100644
--- a/src/clean.h
+++ b/src/clean.h
@@ -63,8 +63,10 @@ void TY_(BumpObject)( TidyDocImpl* doc, Node *html );
#if 0
void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent );
#endif
-
+/* Issue #456 - This is discarded */
+#if 0
void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
+#endif
void TY_(DropComments)(TidyDocImpl* doc, Node* node);
void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);
diff --git a/src/lexer.c b/src/lexer.c
index c2773dc..b3832d9 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -1827,19 +1827,21 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
return node;
}
-/* Check meta charset
- 1. if there is no meta charset, it adds one.
- 2. if there is a meta charset, it moves it to the top if HEAD.
- 3. if it doesn't match the output encoding, warn about that.
- 4. if there are duplicates, discard them.
- */
+/*\
+ * Issue #456 - Check meta charset
+ * 1. if there is no meta charset, add one according to the doctype, with no warning.
+ * 2. if there is a meta charset, move it to the top of HEAD. Not sure this is required?
+ * 3. if it doesn't match the output encoding, fix it. Maybe no warning?
+ * 4. if there are duplicates, discard them, with a warning.
+\*/
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
{
AttVal *charsetAttr;
AttVal *contentAttr;
AttVal *httpEquivAttr;
Bool charsetFound = no;
- ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
+ uint outenc = cfg(doc, TidyOutCharEncoding);
+ ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
Node *currentNode;
Node *head = TY_(FindHEAD)( doc );
Node *metaTag;
@@ -1850,25 +1852,38 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
tmbstr lcontent;
tmbstr newValue;
/* We can't do anything we don't have a head or encoding is NULL */
- if( !head || !enc )
+ if( !head || !enc || !TY_(tmbstrlen)(enc))
return no;
+ if (outenc == RAW)
+ return no;
+#ifndef NO_NATIVE_ISO2022_SUPPORT
+ if (outenc == ISO2022)
+ return no;
+#endif
+
tidyBufInit(&charsetString);
+ /* Set up the content test 'charset=value' */
+ tidyBufClear(&charsetString);
+ tidyBufAppend(&charsetString, "charset=", 8);
+ tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
+ tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
+ /* process the children of the head */
for (currentNode = head->content; currentNode; currentNode = currentNode->next)
{
if (!nodeIsMETA(currentNode))
- continue;
+ continue; /* not a meta node */
charsetAttr = attrGetCHARSET(currentNode);
httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
if(!charsetAttr && !httpEquivAttr)
- continue;
+ continue; /* has no charset attribute */
/*
Meta charset comes in quite a few flavors:
- 1. - expected for (X)HTML5.
+ 1. - expected for (X)HTML5.
*/
if (charsetAttr && !httpEquivAttr)
{
- // we already found one, so remove the rest.
- if(charsetFound)
+ /* we already found one, so remove the rest. */
+ if(charsetFound || !charsetAttr->value)
{
prevNode = currentNode->prev;
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
@@ -1877,15 +1892,19 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
continue;
}
charsetFound = yes;
- // Fix mismatched attribute value
+ /* Fix mismatched attribute value */
if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
{
- newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) );
+ newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) + 1 ); /* allocate + 1 for 0 */
TY_(tmbstrcpy)( newValue, enc );
- TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
+ /* Note: previously http-equiv had been modified, without warning
+ in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
+ TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
+ */
+ TidyDocFree(doc, charsetAttr->value); /* free current value */
charsetAttr->value = newValue;
}
- // Make sure it's the first element.
+ /* Make sure it's the first element. */
if ( currentNode != head->content->next ){
TY_(RemoveNode)( currentNode );
TY_(InsertNodeAtStart)( head, currentNode );
@@ -1893,51 +1912,110 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
continue;
}
/*
- 2.
- expected for HTML4. This is normally ok - but can clash.
+ 2.
+ expected for HTML4. This is normally ok - but can clash.
*/
if(httpEquivAttr && !charsetAttr)
{
- tidyBufClear(&charsetString);
- tidyBufAppend(&charsetString, "charset=", 8);
- tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)( enc ));
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
- httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
-
- if(!contentAttr || TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
+ if (!contentAttr)
+ continue; /* has no 'content' attribute */
+ if (!httpEquivAttr->value)
+ {
+ prevNode = currentNode->prev;
+ TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+ TY_(DiscardElement)(doc, currentNode);
+ currentNode = prevNode;
continue;
- lcontent = TY_(tmbstrtolower)(contentAttr->value);
- if(TY_(tmbsubstr)(lcontent, (ctmbstr) &charsetString)){
- printf("WARN ABOUT CLASH: %s \n", contentAttr->value);
}
+ httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
+ if(TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
+ continue; /* is not 'content-type' */
+ if (!contentAttr->value)
+ {
+ prevNode = currentNode->prev;
+ /* maybe need better message here */
+ TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+ TY_(DiscardElement)(doc, currentNode);
+ currentNode = prevNode;
+ continue;
+ }
+ /* check that the encoding matches.
+ If a mismatch is found here, fix it. This was previously done silently
+ in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
+ */
+ lcontent = TY_(tmbstrtolower)(contentAttr->value);
+ if (TY_(tmbsubstr)(lcontent, charsetString.bp))
+ {
+ /* we already found one, so remove the rest. */
+ if (charsetFound)
+ {
+ prevNode = currentNode->prev;
+ TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+ TY_(DiscardElement)(doc, currentNode);
+ currentNode = prevNode;
+ continue;
+ }
+ charsetFound = yes;
+ }
+ else
+ {
+ /* fix a mismatch */
+ if (charsetFound)
+ {
+ prevNode = currentNode->prev;
+ TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+ TY_(DiscardElement)(doc, currentNode);
+ currentNode = prevNode;
+ }
+ else
+ {
+ /* correct the content */
+ newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
+ TidyDocFree(doc, contentAttr->value);
+ TY_(tmbstrcpy)(newValue, "text/html; charset=");
+ TY_(tmbstrcpy)(newValue + 19, enc);
+ contentAttr->value = newValue;
+ charsetFound = yes;
+ }
+ }
+ continue;
}
/*
- 3.
- This is generally bad.
+ 3.
+ This is generally bad. Discard and warn.
*/
if(httpEquivAttr && charsetAttr)
{
- printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n");
+ /* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
+ prevNode = currentNode->prev;
+ TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+ TY_(DiscardElement)(doc, currentNode);
+ currentNode = prevNode;
}
}
- if(charsetFound){
- return yes;
- }
- metaTag = TY_(InferredTag)(doc, TidyTag_META);
- switch(TY_(HTMLVersion)(doc))
+
+ /* completed head scan - add appropriate meta - if 'yes' and none exists */
+ if (cfgBool(doc, TidyMetaCharset) && !charsetFound)
{
+ /* add appropriate meta charset tag - no warning */
+ metaTag = TY_(InferredTag)(doc, TidyTag_META);
+ switch (TY_(HTMLVersion)(doc))
+ {
case HT50:
case XH50:
- TY_(AddAttribute)( doc, metaTag, "charset", enc);
+ TY_(AddAttribute)(doc, metaTag, "charset", enc);
break;
default:
tidyBufInit(&buf);
- tidyBufAppend(&buf, "text/html; charset=", 19);
- tidyBufAppend(&buf, (char*)enc, TY_(tmbstrlen)(enc));
- TY_(AddAttribute)( doc, metaTag, "content", (char*)buf.bp);
+ tidyBufAppend(&buf, "text/html; ", 11);
+ tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp));
+ tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
+ TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
tidyBufFree(&buf);
+ }
+ TY_(InsertNodeAtStart)(head, metaTag);
}
- TY_(InsertNodeAtStart)( head, metaTag );
tidyBufFree(&charsetString);
return yes;
}
diff --git a/src/tidylib.c b/src/tidylib.c
index 811721b..e2c443c 100755
--- a/src/tidylib.c
+++ b/src/tidylib.c
@@ -1992,7 +1992,6 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut );
Bool xmlDecl = cfgBool( doc, TidyXmlDecl );
Bool tidyMark = cfgBool( doc, TidyMark );
- Bool tidyMetaCharset = cfgBool( doc, TidyMetaCharset);
Bool tidyXmlTags = cfgBool( doc, TidyXmlTags );
Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName );
Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis );
@@ -2044,12 +2043,7 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
#endif
/* Reconcile http-equiv meta element with output encoding */
- if (cfg( doc, TidyOutCharEncoding) != RAW
-#ifndef NO_NATIVE_ISO2022_SUPPORT
- && cfg( doc, TidyOutCharEncoding) != ISO2022
-#endif
- )
- TY_(VerifyHTTPEquiv)( doc, TY_(FindHEAD)( doc ));
+ TY_(TidyMetaCharset)(doc);
if ( !TY_(CheckNodeIntegrity)( &doc->root ) )
TidyPanic( doc->allocator, integrity );
@@ -2097,8 +2091,6 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
if (tidyMark )
TY_(AddGenerator)(doc);
- if (tidyMetaCharset)
- TY_(TidyMetaCharset)(doc);
}
/* ensure presence of initial */