diff --git a/src/lexer.c b/src/lexer.c index 0a48e53..7f3d683 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1682,72 +1682,86 @@ Node *TY_(FindBody)( TidyDocImpl* doc ) */ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) { - Node *head = TY_(FindHEAD)( doc ); - ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding)); + AttVal *charsetAttr; + AttVal *contentAttr; + AttVal *httpEquivAttr; Bool charsetFound = no; - // We can't do anything we don't have a head or encoding is NULL + ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding)); + Node *currentNode; + Node *head = TY_(FindHEAD)( doc ); + Node *metaTag; + Node *prevNode; + TidyBuffer buf; + TidyBuffer charsetString; + tmbstr httpEquivAttrValue; + tmbstr lcontent; + tmbstr newValue; + /* We can't do anything we don't have a head or encoding is NULL */ if( !head || !enc ) return no; - - for (Node *node = head->content; node; node = node->next) + tidyBufInit(&charsetString); + for (currentNode = head->content; currentNode; currentNode = currentNode->next) { - if (!nodeIsMETA(node)) + if (!nodeIsMETA(currentNode)) continue; - AttVal *charsetAttr = attrGetCHARSET(node); - AttVal *httpEquivAttr = attrGetHTTP_EQUIV(node); + charsetAttr = attrGetCHARSET(currentNode); + httpEquivAttr = attrGetHTTP_EQUIV(currentNode); if(!charsetAttr && !httpEquivAttr) continue; - // Meta charset comes in quite a few flavors: - // 1. - expected for (X)HTML5. + /* + Meta charset comes in quite a few flavors: + 1. - expected for (X)HTML5. + */ if (charsetAttr && !httpEquivAttr) { // we already found one, so remove the rest. if(charsetFound) { - Node *prevNode = node->prev; - TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED); - TY_(DiscardElement)( doc, node ); - node = prevNode; + prevNode = currentNode->prev; + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)( doc, currentNode ); + currentNode = prevNode; continue; } charsetFound = yes; - tmbstr lCharset = TY_(tmbstrtolower)(charsetAttr->value); // Fix mismatched attribute value - if(strcmp(lCharset, enc) != 0) + if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0) { - tmbstr newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) ); + newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) ); TY_(tmbstrcpy)( newValue, enc ); - TY_(ReportAttrError)( doc, node, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED ); + TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED ); charsetAttr->value = newValue; } // Make sure it's the first element. - if ( node != head->content->next ){ - TY_(RemoveNode)( node ); - TY_(InsertNodeAtStart)( head, node ); + if ( currentNode != head->content->next ){ + TY_(RemoveNode)( currentNode ); + TY_(InsertNodeAtStart)( head, currentNode ); } continue; } - - // 2. - // expected for HTML4. This is normally ok - but can clash. + /* + 2. + expected for HTML4. This is normally ok - but can clash. + */ if(httpEquivAttr && !charsetAttr) { - AttVal *contentAttr = TY_(AttrGetById)(node, TidyAttr_CONTENT); - tmbstr lvalue = TY_(tmbstrtolower)(httpEquivAttr->value); - if(!contentAttr || strcmp(lvalue, "content-type") != 0) + tidyBufClear(&charsetString); + tidyBufAppend(&charsetString, "charset=", 8); + tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)( enc )); + contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT); + httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value); + + if(!contentAttr || TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0) continue; - tmbstr lcontent = TY_(tmbstrtolower)(contentAttr->value); - char* charsetString = "charset="; - char* expected = calloc(strlen(enc) + strlen(charsetString) + 1, sizeof(char*)); - strcat(expected, charsetString); - strcat(expected, enc); - if(TY_(tmbsubstr)(lcontent, expected)){ + lcontent = TY_(tmbstrtolower)(contentAttr->value); + if(TY_(tmbsubstr)(lcontent, (ctmbstr) &charsetString)){ printf("WARN ABOUT CLASH: %s \n", contentAttr->value); } - free(expected); } - // 3. - // This is generally bad. + /* + 3. + This is generally bad. + */ if(httpEquivAttr && charsetAttr) { printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); @@ -1756,20 +1770,22 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) if(charsetFound){ return yes; } - Node *node = TY_(InferredTag)(doc, TidyTag_META); + metaTag = TY_(InferredTag)(doc, TidyTag_META); switch(TY_(HTMLVersion)(doc)) { case HT50: case XH50: - TY_(AddAttribute)( doc, node, "charset", enc); + TY_(AddAttribute)( doc, metaTag, "charset", enc); break; default: - TY_(AddAttribute)( doc, node, "http-equiv", "content-type"); - TY_(AddAttribute)( doc, node, "content", "text/html; charset="); - AttVal *contentAttr = TY_(AttrGetById)(node, TidyAttr_CONTENT); - TY_(tmbstrcat)(contentAttr->value, enc); + tidyBufInit(&buf); + tidyBufAppend(&buf, "text/html; charset=", 19); + tidyBufAppend(&buf, (char*)enc, TY_(tmbstrlen)(enc)); + TY_(AddAttribute)( doc, metaTag, "content", (char*)buf.bp); + tidyBufFree(&buf); } - TY_(InsertNodeAtStart)( head, node ); + TY_(InsertNodeAtStart)( head, metaTag ); + tidyBufFree(&charsetString); return yes; }