fix(lexer.c): fixes from initial review

This commit is contained in:
Marcos Caceres 2016-10-17 17:00:58 +11:00
parent 523d58b004
commit aff76bec38

View file

@ -1682,72 +1682,86 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
*/ */
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
{ {
Node *head = TY_(FindHEAD)( doc ); AttVal *charsetAttr;
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding)); AttVal *contentAttr;
AttVal *httpEquivAttr;
Bool charsetFound = no; Bool charsetFound = no;
// We can't do anything we don't have a head or encoding is NULL ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
Node *currentNode;
Node *head = TY_(FindHEAD)( doc );
Node *metaTag;
Node *prevNode;
TidyBuffer buf;
TidyBuffer charsetString;
tmbstr httpEquivAttrValue;
tmbstr lcontent;
tmbstr newValue;
/* We can't do anything we don't have a head or encoding is NULL */
if( !head || !enc ) if( !head || !enc )
return no; return no;
tidyBufInit(&charsetString);
for (Node *node = head->content; node; node = node->next) for (currentNode = head->content; currentNode; currentNode = currentNode->next)
{ {
if (!nodeIsMETA(node)) if (!nodeIsMETA(currentNode))
continue; continue;
AttVal *charsetAttr = attrGetCHARSET(node); charsetAttr = attrGetCHARSET(currentNode);
AttVal *httpEquivAttr = attrGetHTTP_EQUIV(node); httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
if(!charsetAttr && !httpEquivAttr) if(!charsetAttr && !httpEquivAttr)
continue; continue;
// Meta charset comes in quite a few flavors: /*
// 1. <meta charset=value> - expected for (X)HTML5. Meta charset comes in quite a few flavors:
1. <meta charset=value> - expected for (X)HTML5.
*/
if (charsetAttr && !httpEquivAttr) if (charsetAttr && !httpEquivAttr)
{ {
// we already found one, so remove the rest. // we already found one, so remove the rest.
if(charsetFound) if(charsetFound)
{ {
Node *prevNode = node->prev; prevNode = currentNode->prev;
TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED); TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)( doc, node ); TY_(DiscardElement)( doc, currentNode );
node = prevNode; currentNode = prevNode;
continue; continue;
} }
charsetFound = yes; charsetFound = yes;
tmbstr lCharset = TY_(tmbstrtolower)(charsetAttr->value);
// Fix mismatched attribute value // Fix mismatched attribute value
if(strcmp(lCharset, enc) != 0) if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
{ {
tmbstr newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) ); newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) );
TY_(tmbstrcpy)( newValue, enc ); TY_(tmbstrcpy)( newValue, enc );
TY_(ReportAttrError)( doc, node, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED ); TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
charsetAttr->value = newValue; charsetAttr->value = newValue;
} }
// Make sure it's the first element. // Make sure it's the first element.
if ( node != head->content->next ){ if ( currentNode != head->content->next ){
TY_(RemoveNode)( node ); TY_(RemoveNode)( currentNode );
TY_(InsertNodeAtStart)( head, node ); TY_(InsertNodeAtStart)( head, currentNode );
} }
continue; continue;
} }
/*
// 2. <meta http-equiv="content-type" content="text/html; charset=UTF-8"> 2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
// expected for HTML4. This is normally ok - but can clash. expected for HTML4. This is normally ok - but can clash.
*/
if(httpEquivAttr && !charsetAttr) if(httpEquivAttr && !charsetAttr)
{ {
AttVal *contentAttr = TY_(AttrGetById)(node, TidyAttr_CONTENT); tidyBufClear(&charsetString);
tmbstr lvalue = TY_(tmbstrtolower)(httpEquivAttr->value); tidyBufAppend(&charsetString, "charset=", 8);
if(!contentAttr || strcmp(lvalue, "content-type") != 0) tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)( enc ));
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
if(!contentAttr || TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
continue; continue;
tmbstr lcontent = TY_(tmbstrtolower)(contentAttr->value); lcontent = TY_(tmbstrtolower)(contentAttr->value);
char* charsetString = "charset="; if(TY_(tmbsubstr)(lcontent, (ctmbstr) &charsetString)){
char* expected = calloc(strlen(enc) + strlen(charsetString) + 1, sizeof(char*));
strcat(expected, charsetString);
strcat(expected, enc);
if(TY_(tmbsubstr)(lcontent, expected)){
printf("WARN ABOUT CLASH: %s \n", contentAttr->value); printf("WARN ABOUT CLASH: %s \n", contentAttr->value);
} }
free(expected);
} }
// 3. <meta charset="utf-8" http-equiv="Content-Type" content="..."> /*
// This is generally bad. 3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
This is generally bad.
*/
if(httpEquivAttr && charsetAttr) if(httpEquivAttr && charsetAttr)
{ {
printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n");
@ -1756,20 +1770,22 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
if(charsetFound){ if(charsetFound){
return yes; return yes;
} }
Node *node = TY_(InferredTag)(doc, TidyTag_META); metaTag = TY_(InferredTag)(doc, TidyTag_META);
switch(TY_(HTMLVersion)(doc)) switch(TY_(HTMLVersion)(doc))
{ {
case HT50: case HT50:
case XH50: case XH50:
TY_(AddAttribute)( doc, node, "charset", enc); TY_(AddAttribute)( doc, metaTag, "charset", enc);
break; break;
default: default:
TY_(AddAttribute)( doc, node, "http-equiv", "content-type"); tidyBufInit(&buf);
TY_(AddAttribute)( doc, node, "content", "text/html; charset="); tidyBufAppend(&buf, "text/html; charset=", 19);
AttVal *contentAttr = TY_(AttrGetById)(node, TidyAttr_CONTENT); tidyBufAppend(&buf, (char*)enc, TY_(tmbstrlen)(enc));
TY_(tmbstrcat)(contentAttr->value, enc); TY_(AddAttribute)( doc, metaTag, "content", (char*)buf.bp);
tidyBufFree(&buf);
} }
TY_(InsertNodeAtStart)( head, node ); TY_(InsertNodeAtStart)( head, metaTag );
tidyBufFree(&charsetString);
return yes; return yes;
} }