diff --git a/src/clean.c b/src/clean.c
index 8db77c7..0abf53a 100644
--- a/src/clean.c
+++ b/src/clean.c
@@ -2208,7 +2208,8 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
}
#endif
-/* Issue #456 - This is discarded */
+/* Issue #456 - This is discarded
+ See replacement TidyMetaCharset */
#if 0
void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
{
@@ -2287,6 +2288,200 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
}
#endif
+/*\
+ * Issue #456 - Check meta charset (replacement for the discarded
+ * VerifyHTTPEquiv). Reconciles the META children of HEAD with the
+ * configured output encoding:
+ * 1. if there is no meta charset and the TidyMetaCharset option is set, it
+ *    adds one at the top of HEAD, suited to the doctype, no warning.
+ * 2. if there is a <meta charset>, it moves it to the top of HEAD (unless
+ *    it is already the second child).
+ * 3. if it doesn't match the output encoding, it fixes it, no warning.
+ * 4. duplicates, entries lacking a value, and metas carrying both charset
+ *    and http-equiv are discarded, with a DISCARDING_UNEXPECTED report.
+ *
+ * Attribute values are lower-cased while being compared - presumably in
+ * place, since TY_(tmbstrtolower) is handed the stored value; verify.
+ *
+ * Returns no, doing nothing, if there is no HEAD, no encoding name, or the
+ * output encoding is RAW (or ISO2022 where supported); otherwise yes.
+\*/
+/* Report a META child of head as unexpected, then drop it from the tree. */
+static void DiscardUnexpectedMeta(TidyDocImpl* doc, Node *head, Node *meta)
+{
+    TY_(ReportError)(doc, head, meta, DISCARDING_UNEXPECTED);
+    TY_(DiscardElement)(doc, meta);
+}
+
+Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
+{
+    AttVal *charsetAttr;
+    AttVal *contentAttr;
+    AttVal *httpEquivAttr;
+    Bool charsetFound = no;
+    uint outenc = cfg(doc, TidyOutCharEncoding);
+    ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
+    Node *currentNode;
+    Node *head = TY_(FindHEAD)(doc);
+    Node *metaTag;
+    Node *nextNode;
+    TidyBuffer buf;
+    TidyBuffer charsetString;
+    tmbstr httpEquivAttrValue;
+    tmbstr lcontent;
+    tmbstr newValue;
+
+    /* We can't do anything if we don't have a head or encoding is NULL */
+    if (!head || !enc || !TY_(tmbstrlen)(enc))
+        return no;
+    if (outenc == RAW)
+        return no;
+#ifndef NO_NATIVE_ISO2022_SUPPORT
+    if (outenc == ISO2022)
+        return no;
+#endif
+
+    /* Set up the content test 'charset=value' */
+    tidyBufInit(&charsetString);
+    tidyBufClear(&charsetString);
+    tidyBufAppend(&charsetString, "charset=", 8);
+    tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
+    tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
+
+    /* process the children of the head */
+    for (currentNode = head->content; currentNode; currentNode = nextNode)
+    {
+        /* Take the successor up front: currentNode may be discarded below,
+           and a discarded first child has no prev to step back to. */
+        nextNode = currentNode->next;
+        if (!nodeIsMETA(currentNode))
+            continue; /* not a meta node */
+        charsetAttr = attrGetCHARSET(currentNode);
+        httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
+        if (!charsetAttr && !httpEquivAttr)
+            continue; /* has neither a charset nor an http-equiv attribute */
+
+        /*
+          Meta charset comes in quite a few flavors:
+          1. <meta charset="..."> - expected for (X)HTML5.
+        */
+        if (charsetAttr && !httpEquivAttr)
+        {
+            /* no value, or we already found one, so remove the rest. */
+            if (charsetFound || !charsetAttr->value)
+            {
+                DiscardUnexpectedMeta(doc, head, currentNode);
+                continue;
+            }
+            charsetFound = yes;
+            /* Fix mismatched attribute value; the existing value is
+               lower-cased for the comparison. */
+            if (TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
+            {
+                newValue = (tmbstr)TidyDocAlloc(doc, TY_(tmbstrlen)(enc) + 1); /* allocate + 1 for 0 */
+                TY_(tmbstrcpy)(newValue, enc);
+                /* Note: previously http-equiv had been modified, without warning
+                   in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
+                   TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
+                */
+                TidyDocFree(doc, charsetAttr->value); /* free current value */
+                charsetAttr->value = newValue;
+            }
+            /* Make sure it's at the top of HEAD.
+               NOTE(review): this tests against the second child, so a meta
+               already in second place is left there - confirm intent. */
+            if (currentNode != head->content->next) {
+                TY_(RemoveNode)(currentNode);
+                TY_(InsertNodeAtStart)(head, currentNode);
+            }
+            continue;
+        }
+
+        /*
+          2. <meta http-equiv="Content-Type" content="...; charset=...">
+          expected for HTML4. This is normally ok - but can clash.
+        */
+        if (httpEquivAttr && !charsetAttr)
+        {
+            contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
+            if (!contentAttr)
+                continue; /* has no 'content' attribute */
+            if (!httpEquivAttr->value)
+            {
+                DiscardUnexpectedMeta(doc, head, currentNode);
+                continue;
+            }
+            /* compare via the returned pointer rather than relying on the
+               attribute value having been lower-cased in place */
+            httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
+            if (TY_(tmbstrcmp)(httpEquivAttrValue, (tmbstr) "content-type") != 0)
+                continue; /* is not 'content-type' */
+            if (!contentAttr->value)
+            {
+                /* maybe need better message here */
+                DiscardUnexpectedMeta(doc, head, currentNode);
+                continue;
+            }
+            /* check encoding matches
+               If a mismatch is found here, fix it. Previously silently done
+               in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
+            */
+            lcontent = TY_(tmbstrtolower)(contentAttr->value);
+            if (charsetFound)
+            {
+                /* we already found one, so remove the rest - matching or not. */
+                DiscardUnexpectedMeta(doc, head, currentNode);
+                continue;
+            }
+            if (!TY_(tmbsubstr)(lcontent, charsetString.bp))
+            {
+                /* fix a mismatch; 19 is the length of "text/html; charset=" */
+                newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
+                TidyDocFree(doc, contentAttr->value);
+                TY_(tmbstrcpy)(newValue, "text/html; charset=");
+                TY_(tmbstrcpy)(newValue + 19, enc);
+                contentAttr->value = newValue;
+            }
+            charsetFound = yes;
+            continue;
+        }
+
+        /*
+          3. <meta charset="..." http-equiv="Content-Type" content="...">
+          Both attributes present. This is generally bad. Discard and warn.
+        */
+        DiscardUnexpectedMeta(doc, head, currentNode);
+    }
+
+    /* completed head scan - add appropriate meta - if 'yes' and none exists */
+    if (cfgBool(doc, TidyMetaCharset) && !charsetFound)
+    {
+        /* add appropriate meta charset tag - no warning */
+        metaTag = TY_(InferredTag)(doc, TidyTag_META);
+        switch (TY_(HTMLVersion)(doc))
+        {
+        case HT50:
+        case XH50:
+            TY_(AddAttribute)(doc, metaTag, "charset", enc);
+            break;
+        default:
+            tidyBufInit(&buf);
+            tidyBufAppend(&buf, "text/html; ", 11);
+            tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp));
+            tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
+            /* 'content' alone declares nothing: it needs its http-equiv */
+            TY_(AddAttribute)(doc, metaTag, "http-equiv", "Content-Type");
+            TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
+            tidyBufFree(&buf);
+            break;
+        }
+        TY_(InsertNodeAtStart)(head, metaTag);
+    }
+    tidyBufFree(&charsetString);
+    return yes;
+}
+
+
void TY_(DropComments)(TidyDocImpl* doc, Node* node)
{
Node* next;
diff --git a/src/clean.h b/src/clean.h
index d5d4117..e538bcf 100644
--- a/src/clean.h
+++ b/src/clean.h
@@ -67,6 +67,7 @@ void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent );
#if 0
void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
#endif
+Bool TY_(TidyMetaCharset)(TidyDocImpl* doc);
void TY_(DropComments)(TidyDocImpl* doc, Node* node);
void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);
diff --git a/src/lexer.c b/src/lexer.c
index b3832d9..238fbfa 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -1827,199 +1827,6 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
return node;
}
-/*\
- * Issue #456 - Check meta charset
- * 1. if there is no meta charset, it adds one, according to doctype, no warning.
- * 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required?
- * 3. if it doesn't match the output encoding, and fix. Naybe no warning?
- * 4. if there are duplicates, discard them, with warning.
-\*/
-Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
-{
- AttVal *charsetAttr;
- AttVal *contentAttr;
- AttVal *httpEquivAttr;
- Bool charsetFound = no;
- uint outenc = cfg(doc, TidyOutCharEncoding);
- ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
- Node *currentNode;
- Node *head = TY_(FindHEAD)( doc );
- Node *metaTag;
- Node *prevNode;
- TidyBuffer buf;
- TidyBuffer charsetString;
- tmbstr httpEquivAttrValue;
- tmbstr lcontent;
- tmbstr newValue;
- /* We can't do anything we don't have a head or encoding is NULL */
- if( !head || !enc || !TY_(tmbstrlen)(enc))
- return no;
- if (outenc == RAW)
- return no;
-#ifndef NO_NATIVE_ISO2022_SUPPORT
- if (outenc == ISO2022)
- return no;
-#endif
-
- tidyBufInit(&charsetString);
- /* Set up the content test 'charset=value' */
- tidyBufClear(&charsetString);
- tidyBufAppend(&charsetString, "charset=", 8);
- tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
- tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
- /* process the children of the head */
- for (currentNode = head->content; currentNode; currentNode = currentNode->next)
- {
- if (!nodeIsMETA(currentNode))
- continue; /* not a meta node */
- charsetAttr = attrGetCHARSET(currentNode);
- httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
- if(!charsetAttr && !httpEquivAttr)
- continue; /* has no charset attribute */
- /*
- Meta charset comes in quite a few flavors:
- 1. - expected for (X)HTML5.
- */
- if (charsetAttr && !httpEquivAttr)
- {
- /* we already found one, so remove the rest. */
- if(charsetFound || !charsetAttr->value)
- {
- prevNode = currentNode->prev;
- TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
- TY_(DiscardElement)( doc, currentNode );
- currentNode = prevNode;
- continue;
- }
- charsetFound = yes;
- /* Fix mismatched attribute value */
- if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
- {
- newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) + 1 ); /* allocate + 1 for 0 */
- TY_(tmbstrcpy)( newValue, enc );
- /* Note: previously http-equiv had been modified, without warning
- in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
- TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
- */
- TidyDocFree(doc, charsetAttr->value); /* free current value */
- charsetAttr->value = newValue;
- }
- /* Make sure it's the first element. */
- if ( currentNode != head->content->next ){
- TY_(RemoveNode)( currentNode );
- TY_(InsertNodeAtStart)( head, currentNode );
- }
- continue;
- }
- /*
- 2.
- expected for HTML4. This is normally ok - but can clash.
- */
- if(httpEquivAttr && !charsetAttr)
- {
- contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
- if (!contentAttr)
- continue; /* has no 'content' attribute */
- if (!httpEquivAttr->value)
- {
- prevNode = currentNode->prev;
- TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
- TY_(DiscardElement)(doc, currentNode);
- currentNode = prevNode;
- continue;
- }
- httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
- if(TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
- continue; /* is not 'content-type' */
- if (!contentAttr->value)
- {
- prevNode = currentNode->prev;
- /* maybe need better message here */
- TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
- TY_(DiscardElement)(doc, currentNode);
- currentNode = prevNode;
- continue;
- }
- /* check encoding matches
- If a miss-match found here, fix it. previous silently done
- in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
- */
- lcontent = TY_(tmbstrtolower)(contentAttr->value);
- if (TY_(tmbsubstr)(lcontent, charsetString.bp))
- {
- /* we already found one, so remove the rest. */
- if (charsetFound)
- {
- prevNode = currentNode->prev;
- TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
- TY_(DiscardElement)(doc, currentNode);
- currentNode = prevNode;
- continue;
- }
- charsetFound = yes;
- }
- else
- {
- /* fix a mis-match */
- if (charsetFound)
- {
- prevNode = currentNode->prev;
- TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
- TY_(DiscardElement)(doc, currentNode);
- currentNode = prevNode;
- }
- else
- {
- /* correct the content */
- newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
- TidyDocFree(doc, contentAttr->value);
- TY_(tmbstrcpy)(newValue, "text/html; charset=");
- TY_(tmbstrcpy)(newValue + 19, enc);
- contentAttr->value = newValue;
- charsetFound = yes;
- }
- }
- continue;
- }
- /*
- 3.
- This is generally bad. Discard and warn.
- */
- if(httpEquivAttr && charsetAttr)
- {
- /* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
- prevNode = currentNode->prev;
- TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
- TY_(DiscardElement)(doc, currentNode);
- currentNode = prevNode;
- }
- }
-
- /* completed head scan - add appropriate meta - if 'yes' and none exists */
- if (cfgBool(doc, TidyMetaCharset) && !charsetFound)
- {
- /* add appropriate meta charset tag - no warning */
- metaTag = TY_(InferredTag)(doc, TidyTag_META);
- switch (TY_(HTMLVersion)(doc))
- {
- case HT50:
- case XH50:
- TY_(AddAttribute)(doc, metaTag, "charset", enc);
- break;
- default:
- tidyBufInit(&buf);
- tidyBufAppend(&buf, "text/html; ", 11);
- tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp));
- tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
- TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
- tidyBufFree(&buf);
- }
- TY_(InsertNodeAtStart)(head, metaTag);
- }
- tidyBufFree(&charsetString);
- return yes;
-}
-
/* add meta element for Tidy */
Bool TY_(AddGenerator)( TidyDocImpl* doc )
{