From f310f1d5de8ba7ce8adc1659b0e586a83ca8e47f Mon Sep 17 00:00:00 2001
From: Geoff McLane
Date: Mon, 15 May 2017 16:39:53 +0200
Subject: [PATCH] Issue #456 - Move new TidyMetaCharset to clean

---
Review notes (not part of the commit message):
 - TidyMetaCharset as added to clean.c is no longer a verbatim copy of the
   code removed from lexer.c:
   * the head scan resumes at head->content when the discarded meta was the
     first child (previously currentNode became NULL and the for-increment
     dereferenced it);
   * the inferred HTML4-style meta now carries http-equiv="Content-Type" as
     well as content, so this same function recognises it on a later run;
   * the http-equiv comparison uses the lower-cased value it just computed;
   * comment typos fixed on the added side only.
 - Line breaks and indentation were reconstructed from a flattened copy and
   the blob ids on the index lines are those of the original commit; apply
   with whitespace tolerance (e.g. git apply --ignore-whitespace).

 src/clean.c | 205 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 src/clean.h |   1 +
 src/lexer.c | 193 --------------------------------------------------
 3 files changed, 205 insertions(+), 194 deletions(-)

diff --git a/src/clean.c b/src/clean.c
index 8db77c7..0abf53a 100644
--- a/src/clean.c
+++ b/src/clean.c
@@ -2208,7 +2208,8 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
 }
 #endif
 
-/* Issue #456 - This is discarded */
+/* Issue #456 - This is discarded
+   See replacement TidyMetaCharset */
 #if 0
 void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
 {
@@ -2287,6 +2288,208 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
 }
 #endif
 
+/*\
+* Issue #456 - Check meta charset
+* 1. if there is no meta charset and TidyMetaCharset is set, it adds one, according to doctype, no warning.
+* 2. if there is a meta charset, it moves it to the top of HEAD. Not sure this is required?
+* 3. if it doesn't match the output encoding, fix it. Maybe no warning?
+* 4. if there are duplicates, discard them, with warning.
+* Returns no if there is no HEAD, no usable output encoding name, or the output
+* encoding is RAW (or ISO2022 when natively supported); otherwise returns yes.
+\*/
+Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
+{
+    AttVal *charsetAttr;
+    AttVal *contentAttr;
+    AttVal *httpEquivAttr;
+    Bool charsetFound = no;
+    uint outenc = cfg(doc, TidyOutCharEncoding);
+    ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
+    Node *currentNode;
+    Node *head = TY_(FindHEAD)(doc);
+    Node *metaTag;
+    Node *prevNode;
+    TidyBuffer buf;
+    TidyBuffer charsetString;
+    tmbstr httpEquivAttrValue;
+    tmbstr lcontent;
+    tmbstr newValue;
+    /* We can't do anything if we don't have a head or encoding is NULL */
+    if (!head || !enc || !TY_(tmbstrlen)(enc))
+        return no;
+    if (outenc == RAW)
+        return no;
+#ifndef NO_NATIVE_ISO2022_SUPPORT
+    if (outenc == ISO2022)
+        return no;
+#endif
+
+    tidyBufInit(&charsetString);
+    /* Set up the content test 'charset=value' */
+    tidyBufClear(&charsetString);
+    tidyBufAppend(&charsetString, "charset=", 8);
+    tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
+    tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
+    /* process the children of the head. A discard steps currentNode back to the
+       previous sibling, which is NULL when the first child was discarded; in that
+       case resume at head->content (presumably the new first child after unlinking). */
+    for (currentNode = head->content; currentNode;
+         currentNode = (currentNode ? currentNode->next : head->content))
+    {
+        if (!nodeIsMETA(currentNode))
+            continue; /* not a meta node */
+        charsetAttr = attrGetCHARSET(currentNode);
+        httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
+        if (!charsetAttr && !httpEquivAttr)
+            continue; /* has no charset attribute */
+        /*
+         Meta charset comes in quite a few flavors:
+         1. <meta charset="value"> - expected for (X)HTML5.
+        */
+        if (charsetAttr && !httpEquivAttr)
+        {
+            /* we already found one, so remove the rest. */
+            if (charsetFound || !charsetAttr->value)
+            {
+                prevNode = currentNode->prev;
+                TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+                TY_(DiscardElement)(doc, currentNode);
+                currentNode = prevNode;
+                continue;
+            }
+            charsetFound = yes;
+            /* Fix mismatched attribute value */
+            if (TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
+            {
+                newValue = (tmbstr)TidyDocAlloc(doc, TY_(tmbstrlen)(enc) + 1); /* allocate + 1 for 0 */
+                TY_(tmbstrcpy)(newValue, enc);
+                /* Note: previously http-equiv had been modified, without warning
+                   in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
+                   TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
+                */
+                TidyDocFree(doc, charsetAttr->value); /* free current value */
+                charsetAttr->value = newValue;
+            }
+            /* Make sure it's the first element. */
+            if (currentNode != head->content->next) {
+                TY_(RemoveNode)(currentNode);
+                TY_(InsertNodeAtStart)(head, currentNode);
+            }
+            continue;
+        }
+        /*
+         2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
+         expected for HTML4. This is normally ok - but can clash.
+        */
+        if (httpEquivAttr && !charsetAttr)
+        {
+            contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
+            if (!contentAttr)
+                continue; /* has no 'content' attribute */
+            if (!httpEquivAttr->value)
+            {
+                prevNode = currentNode->prev;
+                TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+                TY_(DiscardElement)(doc, currentNode);
+                currentNode = prevNode;
+                continue;
+            }
+            httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
+            if (TY_(tmbstrcmp)(httpEquivAttrValue, (tmbstr) "content-type") != 0)
+                continue; /* is not 'content-type' */
+            if (!contentAttr->value)
+            {
+                prevNode = currentNode->prev;
+                /* maybe need better message here */
+                TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+                TY_(DiscardElement)(doc, currentNode);
+                currentNode = prevNode;
+                continue;
+            }
+            /* check encoding matches
+               If a mismatch is found here, fix it. previous silently done
+               in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
+            */
+            lcontent = TY_(tmbstrtolower)(contentAttr->value);
+            if (TY_(tmbsubstr)(lcontent, charsetString.bp))
+            {
+                /* we already found one, so remove the rest. */
+                if (charsetFound)
+                {
+                    prevNode = currentNode->prev;
+                    TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+                    TY_(DiscardElement)(doc, currentNode);
+                    currentNode = prevNode;
+                    continue;
+                }
+                charsetFound = yes;
+            }
+            else
+            {
+                /* fix a mis-match */
+                if (charsetFound)
+                {
+                    prevNode = currentNode->prev;
+                    TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+                    TY_(DiscardElement)(doc, currentNode);
+                    currentNode = prevNode;
+                }
+                else
+                {
+                    /* correct the content */
+                    newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
+                    TidyDocFree(doc, contentAttr->value);
+                    TY_(tmbstrcpy)(newValue, "text/html; charset=");
+                    TY_(tmbstrcpy)(newValue + 19, enc);
+                    contentAttr->value = newValue;
+                    charsetFound = yes;
+                }
+            }
+            continue;
+        }
+        /*
+         3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
+         This is generally bad. Discard and warn.
+        */
+        if (httpEquivAttr && charsetAttr)
+        {
+            /* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
+            prevNode = currentNode->prev;
+            TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+            TY_(DiscardElement)(doc, currentNode);
+            currentNode = prevNode;
+        }
+    }
+
+    /* completed head scan - add appropriate meta - if 'yes' and none exists */
+    if (cfgBool(doc, TidyMetaCharset) && !charsetFound)
+    {
+        /* add appropriate meta charset tag - no warning */
+        metaTag = TY_(InferredTag)(doc, TidyTag_META);
+        switch (TY_(HTMLVersion)(doc))
+        {
+        case HT50:
+        case XH50:
+            TY_(AddAttribute)(doc, metaTag, "charset", enc);
+            break;
+        default:
+            tidyBufInit(&buf);
+            tidyBufAppend(&buf, "text/html; ", 11);
+            tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp));
+            tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
+            /* the scan above only accepts a 'content' charset on a meta that has
+               http-equiv="content-type", so the HTML4-style tag needs both */
+            TY_(AddAttribute)(doc, metaTag, "http-equiv", "Content-Type");
+            TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
+            tidyBufFree(&buf);
+        }
+        TY_(InsertNodeAtStart)(head, metaTag);
+    }
+    tidyBufFree(&charsetString);
+    return yes;
+}
+
+
 void TY_(DropComments)(TidyDocImpl* doc, Node* node)
 {
     Node* next;
diff --git a/src/clean.h b/src/clean.h
index d5d4117..e538bcf 100644
--- a/src/clean.h
+++ b/src/clean.h
@@ -67,6 +67,7 @@ void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent );
 #if 0
 void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
 #endif
+Bool TY_(TidyMetaCharset)(TidyDocImpl* doc);
 
 void TY_(DropComments)(TidyDocImpl* doc, Node* node);
 void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);
diff --git a/src/lexer.c b/src/lexer.c
index b3832d9..238fbfa 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -1827,199 +1827,6 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
     return node;
 }
 
-/*\
- * Issue #456 - Check meta charset
- * 1. if there is no meta charset, it adds one, according to doctype, no warning.
- * 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required?
- * 3. if it doesn't match the output encoding, and fix. Naybe no warning?
- * 4. if there are duplicates, discard them, with warning.
-\*/
-Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
-{
-    AttVal *charsetAttr;
-    AttVal *contentAttr;
-    AttVal *httpEquivAttr;
-    Bool charsetFound = no;
-    uint outenc = cfg(doc, TidyOutCharEncoding);
-    ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
-    Node *currentNode;
-    Node *head = TY_(FindHEAD)( doc );
-    Node *metaTag;
-    Node *prevNode;
-    TidyBuffer buf;
-    TidyBuffer charsetString;
-    tmbstr httpEquivAttrValue;
-    tmbstr lcontent;
-    tmbstr newValue;
-    /* We can't do anything we don't have a head or encoding is NULL */
-    if( !head || !enc || !TY_(tmbstrlen)(enc))
-        return no;
-    if (outenc == RAW)
-        return no;
-#ifndef NO_NATIVE_ISO2022_SUPPORT
-    if (outenc == ISO2022)
-        return no;
-#endif
-
-    tidyBufInit(&charsetString);
-    /* Set up the content test 'charset=value' */
-    tidyBufClear(&charsetString);
-    tidyBufAppend(&charsetString, "charset=", 8);
-    tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
-    tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
-    /* process the children of the head */
-    for (currentNode = head->content; currentNode; currentNode = currentNode->next)
-    {
-        if (!nodeIsMETA(currentNode))
-            continue; /* not a meta node */
-        charsetAttr = attrGetCHARSET(currentNode);
-        httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
-        if(!charsetAttr && !httpEquivAttr)
-            continue; /* has no charset attribute */
-        /*
-         Meta charset comes in quite a few flavors:
-         1. <meta charset="value"> - expected for (X)HTML5.
-        */
-        if (charsetAttr && !httpEquivAttr)
-        {
-            /* we already found one, so remove the rest. */
-            if(charsetFound || !charsetAttr->value)
-            {
-                prevNode = currentNode->prev;
-                TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
-                TY_(DiscardElement)( doc, currentNode );
-                currentNode = prevNode;
-                continue;
-            }
-            charsetFound = yes;
-            /* Fix mismatched attribute value */
-            if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
-            {
-                newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) + 1 ); /* allocate + 1 for 0 */
-                TY_(tmbstrcpy)( newValue, enc );
-                /* Note: previously http-equiv had been modified, without warning
-                   in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
-                   TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
-                */
-                TidyDocFree(doc, charsetAttr->value); /* free current value */
-                charsetAttr->value = newValue;
-            }
-            /* Make sure it's the first element. */
-            if ( currentNode != head->content->next ){
-                TY_(RemoveNode)( currentNode );
-                TY_(InsertNodeAtStart)( head, currentNode );
-            }
-            continue;
-        }
-        /*
-         2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
-         expected for HTML4. This is normally ok - but can clash.
-        */
-        if(httpEquivAttr && !charsetAttr)
-        {
-            contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
-            if (!contentAttr)
-                continue; /* has no 'content' attribute */
-            if (!httpEquivAttr->value)
-            {
-                prevNode = currentNode->prev;
-                TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
-                TY_(DiscardElement)(doc, currentNode);
-                currentNode = prevNode;
-                continue;
-            }
-            httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
-            if(TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
-                continue; /* is not 'content-type' */
-            if (!contentAttr->value)
-            {
-                prevNode = currentNode->prev;
-                /* maybe need better message here */
-                TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
-                TY_(DiscardElement)(doc, currentNode);
-                currentNode = prevNode;
-                continue;
-            }
-            /* check encoding matches
-               If a miss-match found here, fix it. previous silently done
-               in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
-            */
-            lcontent = TY_(tmbstrtolower)(contentAttr->value);
-            if (TY_(tmbsubstr)(lcontent, charsetString.bp))
-            {
-                /* we already found one, so remove the rest. */
-                if (charsetFound)
-                {
-                    prevNode = currentNode->prev;
-                    TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
-                    TY_(DiscardElement)(doc, currentNode);
-                    currentNode = prevNode;
-                    continue;
-                }
-                charsetFound = yes;
-            }
-            else
-            {
-                /* fix a mis-match */
-                if (charsetFound)
-                {
-                    prevNode = currentNode->prev;
-                    TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
-                    TY_(DiscardElement)(doc, currentNode);
-                    currentNode = prevNode;
-                }
-                else
-                {
-                    /* correct the content */
-                    newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
-                    TidyDocFree(doc, contentAttr->value);
-                    TY_(tmbstrcpy)(newValue, "text/html; charset=");
-                    TY_(tmbstrcpy)(newValue + 19, enc);
-                    contentAttr->value = newValue;
-                    charsetFound = yes;
-                }
-            }
-            continue;
-        }
-        /*
-         3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
-         This is generally bad. Discard and warn.
-        */
-        if(httpEquivAttr && charsetAttr)
-        {
-            /* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
-            prevNode = currentNode->prev;
-            TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
-            TY_(DiscardElement)(doc, currentNode);
-            currentNode = prevNode;
-        }
-    }
-
-    /* completed head scan - add appropriate meta - if 'yes' and none exists */
-    if (cfgBool(doc, TidyMetaCharset) && !charsetFound)
-    {
-        /* add appropriate meta charset tag - no warning */
-        metaTag = TY_(InferredTag)(doc, TidyTag_META);
-        switch (TY_(HTMLVersion)(doc))
-        {
-        case HT50:
-        case XH50:
-            TY_(AddAttribute)(doc, metaTag, "charset", enc);
-            break;
-        default:
-            tidyBufInit(&buf);
-            tidyBufAppend(&buf, "text/html; ", 11);
-            tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp));
-            tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
-            TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
-            tidyBufFree(&buf);
-        }
-        TY_(InsertNodeAtStart)(head, metaTag);
-    }
-    tidyBufFree(&charsetString);
-    return yes;
-}
-
 /* add meta element for Tidy */
 Bool TY_(AddGenerator)( TidyDocImpl* doc )
 {