From 6ebd12be67101df684ce468474619a5ef15a728b Mon Sep 17 00:00:00 2001 From: Geoff McLane Date: Sun, 14 May 2017 19:08:29 +0200 Subject: [PATCH] Issue #456 - More work on this option --- src/clean.c | 3 + src/clean.h | 4 +- src/lexer.c | 160 +++++++++++++++++++++++++++++++++++++------------- src/tidylib.c | 10 +--- 4 files changed, 126 insertions(+), 51 deletions(-) diff --git a/src/clean.c b/src/clean.c index 779ddec..8db77c7 100644 --- a/src/clean.c +++ b/src/clean.c @@ -2208,6 +2208,8 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent ) } #endif +/* Issue #456 - This is discarded */ +#if 0 void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) { Node *pNode; @@ -2283,6 +2285,7 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) pLastProp = NULL; } } +#endif void TY_(DropComments)(TidyDocImpl* doc, Node* node) { diff --git a/src/clean.h b/src/clean.h index 00d4923..d5d4117 100644 --- a/src/clean.h +++ b/src/clean.h @@ -63,8 +63,10 @@ void TY_(BumpObject)( TidyDocImpl* doc, Node *html ); #if 0 void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent ); #endif - +/* Issue #456 - This is discarded */ +#if 0 void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent ); +#endif void TY_(DropComments)(TidyDocImpl* doc, Node* node); void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode); diff --git a/src/lexer.c b/src/lexer.c index c2773dc..b3832d9 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1827,19 +1827,21 @@ Node *TY_(FindBody)( TidyDocImpl* doc ) return node; } -/* Check meta charset - 1. if there is no meta charset, it adds one. - 2. if there is a meta charset, it moves it to the top if HEAD. - 3. if it doesn't match the output encoding, warn about that. - 4. if there are duplicates, discard them. - */ +/*\ + * Issue #456 - Check meta charset + * 1. if there is no meta charset, it adds one, according to doctype, no warning. + * 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required? + * 3. 
if it doesn't match the output encoding, and fix. Maybe no warning? + * 4. if there are duplicates, discard them, with warning. +\*/ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) { AttVal *charsetAttr; AttVal *contentAttr; AttVal *httpEquivAttr; Bool charsetFound = no; - ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding)); + uint outenc = cfg(doc, TidyOutCharEncoding); + ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc); Node *currentNode; Node *head = TY_(FindHEAD)( doc ); Node *metaTag; @@ -1850,25 +1852,38 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) tmbstr lcontent; tmbstr newValue; /* We can't do anything we don't have a head or encoding is NULL */ - if( !head || !enc ) + if( !head || !enc || !TY_(tmbstrlen)(enc)) return no; + if (outenc == RAW) + return no; +#ifndef NO_NATIVE_ISO2022_SUPPORT + if (outenc == ISO2022) + return no; +#endif + tidyBufInit(&charsetString); + /* Set up the content test 'charset=value' */ + tidyBufClear(&charsetString); + tidyBufAppend(&charsetString, "charset=", 8); + tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc)); + tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */ + /* process the children of the head */ for (currentNode = head->content; currentNode; currentNode = currentNode->next) { if (!nodeIsMETA(currentNode)) - continue; + continue; /* not a meta node */ charsetAttr = attrGetCHARSET(currentNode); httpEquivAttr = attrGetHTTP_EQUIV(currentNode); if(!charsetAttr && !httpEquivAttr) - continue; + continue; /* has no charset attribute */ /* Meta charset comes in quite a few flavors: - 1. - expected for (X)HTML5. + 1. - expected for (X)HTML5. */ if (charsetAttr && !httpEquivAttr) { - // we already found one, so remove the rest. - if(charsetFound) + /* we already found one, so remove the rest. 
*/ + if(charsetFound || !charsetAttr->value) { prevNode = currentNode->prev; TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); @@ -1877,15 +1892,19 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) continue; } charsetFound = yes; - // Fix mismatched attribute value + /* Fix mismatched attribute value */ if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0) { - newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) ); + newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) + 1 ); /* allocate + 1 for 0 */ TY_(tmbstrcpy)( newValue, enc ); - TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED ); + /* Note: previously http-equiv had been modified, without warning + in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) + TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED ); + */ + TidyDocFree(doc, charsetAttr->value); /* free current value */ charsetAttr->value = newValue; } - // Make sure it's the first element. + /* Make sure it's the first element. */ if ( currentNode != head->content->next ){ TY_(RemoveNode)( currentNode ); TY_(InsertNodeAtStart)( head, currentNode ); @@ -1893,51 +1912,110 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) continue; } /* - 2. - expected for HTML4. This is normally ok - but can clash. + 2. + expected for HTML4. This is normally ok - but can clash. 
*/ if(httpEquivAttr && !charsetAttr) { - tidyBufClear(&charsetString); - tidyBufAppend(&charsetString, "charset=", 8); - tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)( enc )); contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT); - httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value); - - if(!contentAttr || TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0) + if (!contentAttr) + continue; /* has no 'content' attribute */ + if (!httpEquivAttr->value) + { + prevNode = currentNode->prev; + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)(doc, currentNode); + currentNode = prevNode; continue; - lcontent = TY_(tmbstrtolower)(contentAttr->value); - if(TY_(tmbsubstr)(lcontent, (ctmbstr) &charsetString)){ - printf("WARN ABOUT CLASH: %s \n", contentAttr->value); } + httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value); + if(TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0) + continue; /* is not 'content-type' */ + if (!contentAttr->value) + { + prevNode = currentNode->prev; + /* maybe need better message here */ + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)(doc, currentNode); + currentNode = prevNode; + continue; + } + /* check encoding matches + If a mismatch is found here, fix it. Previously silently done + in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) + */ + lcontent = TY_(tmbstrtolower)(contentAttr->value); + if (TY_(tmbsubstr)(lcontent, charsetString.bp)) + { + /* we already found one, so remove the rest. 
*/ + if (charsetFound) + { + prevNode = currentNode->prev; + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)(doc, currentNode); + currentNode = prevNode; + continue; + } + charsetFound = yes; + } + else + { + /* fix a mis-match */ + if (charsetFound) + { + prevNode = currentNode->prev; + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)(doc, currentNode); + currentNode = prevNode; + } + else + { + /* correct the content */ + newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1); + TidyDocFree(doc, contentAttr->value); + TY_(tmbstrcpy)(newValue, "text/html; charset="); + TY_(tmbstrcpy)(newValue + 19, enc); + contentAttr->value = newValue; + charsetFound = yes; + } + } + continue; } /* - 3. - This is generally bad. + 3. + This is generally bad. Discard and warn. */ if(httpEquivAttr && charsetAttr) { - printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); + /* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! 
\n"); */ + prevNode = currentNode->prev; + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)(doc, currentNode); + currentNode = prevNode; } } - if(charsetFound){ - return yes; - } - metaTag = TY_(InferredTag)(doc, TidyTag_META); - switch(TY_(HTMLVersion)(doc)) + + /* completed head scan - add appropriate meta - if 'yes' and none exists */ + if (cfgBool(doc, TidyMetaCharset) && !charsetFound) { + /* add appropriate meta charset tag - no warning */ + metaTag = TY_(InferredTag)(doc, TidyTag_META); + switch (TY_(HTMLVersion)(doc)) + { case HT50: case XH50: - TY_(AddAttribute)( doc, metaTag, "charset", enc); + TY_(AddAttribute)(doc, metaTag, "charset", enc); break; default: tidyBufInit(&buf); - tidyBufAppend(&buf, "text/html; charset=", 19); - tidyBufAppend(&buf, (char*)enc, TY_(tmbstrlen)(enc)); - TY_(AddAttribute)( doc, metaTag, "content", (char*)buf.bp); + tidyBufAppend(&buf, "text/html; ", 11); + tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp)); + tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */ + TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp); tidyBufFree(&buf); + } + TY_(InsertNodeAtStart)(head, metaTag); } - TY_(InsertNodeAtStart)( head, metaTag ); tidyBufFree(&charsetString); return yes; } diff --git a/src/tidylib.c b/src/tidylib.c index 811721b..e2c443c 100755 --- a/src/tidylib.c +++ b/src/tidylib.c @@ -1992,7 +1992,6 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc ) Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut ); Bool xmlDecl = cfgBool( doc, TidyXmlDecl ); Bool tidyMark = cfgBool( doc, TidyMark ); - Bool tidyMetaCharset = cfgBool( doc, TidyMetaCharset); Bool tidyXmlTags = cfgBool( doc, TidyXmlTags ); Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName ); Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis ); @@ -2044,12 +2043,7 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc ) #endif /* Reconcile http-equiv meta element with output encoding */ - if (cfg( doc, TidyOutCharEncoding) 
!= RAW -#ifndef NO_NATIVE_ISO2022_SUPPORT - && cfg( doc, TidyOutCharEncoding) != ISO2022 -#endif - ) - TY_(VerifyHTTPEquiv)( doc, TY_(FindHEAD)( doc )); + TY_(TidyMetaCharset)(doc); if ( !TY_(CheckNodeIntegrity)( &doc->root ) ) TidyPanic( doc->allocator, integrity ); @@ -2097,8 +2091,6 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc ) if (tidyMark ) TY_(AddGenerator)(doc); - if (tidyMetaCharset) - TY_(TidyMetaCharset)(doc); } /* ensure presence of initial */