From 040c22c6dc26a81d30832ebcc5be91baf983dd49 Mon Sep 17 00:00:00 2001
From: Marcos Caceres
Date: Tue, 4 Oct 2016 16:13:05 +1100
Subject: [PATCH] Part 2 - Implement lexer logic

---
 src/lexer.c | 127 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 121 insertions(+), 6 deletions(-)

diff --git a/src/lexer.c b/src/lexer.c
index ffc4394..2b73604 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -1674,14 +1674,129 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
     return node;
 }
 
-/* Check meta charset*/
-Bool TY_(TidyMetaCharset)( TidyDocImpl* doc )
+/* Check meta charset
+   1. if there is no meta charset, it adds one.
+   2. if there is a meta charset, it moves it to the top of HEAD.
+   3. if it doesn't match the output encoding, warn about that.
+   4. if there are duplicates, discard them.
+   Returns no when there is no HEAD, no output encoding name, or the
+   scratch buffer cannot be allocated; returns yes otherwise.
+ */
+Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
 {
-    AttVal *attval;
-    Node *node;
     Node *head = TY_(FindHEAD)( doc );
-    printf("hello");
-    return no;
+    static const char contentPrefix[] = "text/html; ";
+    static const char charsetPrefix[] = "charset=";
+    ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
+    Bool charsetFound = no;
+    Node *node;
+    Node *next;
+    tmbstr contentValue; /* "text/html; charset=" followed by enc */
+    ctmbstr expected;    /* the "charset=..." tail of contentValue */
+
+    /* We can't do anything if we don't have a head or encoding is NULL */
+    if ( !head || !enc )
+        return no;
+
+    /* Size the buffer from the encoding name itself. sizeof(enc) is only
+       the size of a pointer, so it must not be used to size a buffer. */
+    contentValue = (tmbstr) TidyDocAlloc( doc, (sizeof(contentPrefix) - 1) +
+                                               (sizeof(charsetPrefix) - 1) +
+                                               TY_(tmbstrlen)(enc) + 1 );
+    if ( !contentValue )
+        return no;
+    contentValue[0] = '\0';
+    TY_(tmbstrcat)( contentValue, contentPrefix );
+    TY_(tmbstrcat)( contentValue, charsetPrefix );
+    TY_(tmbstrcat)( contentValue, enc );
+    expected = contentValue + (sizeof(contentPrefix) - 1);
+
+    for ( node = head->content; node; node = next )
+    {
+        AttVal *charsetAttr;
+        AttVal *httpEquivAttr;
+        AttVal *contentAttr;
+
+        /* Remember the successor first: the node may be discarded or moved
+           below, and node->next would then no longer continue this walk. */
+        next = node->next;
+
+        if ( !nodeIsMETA(node) )
+            continue;
+        charsetAttr = TY_(AttrGetById)(node, TidyAttr_CHARSET);
+        httpEquivAttr = TY_(AttrGetById)(node, TidyAttr_HTTP_EQUIV);
+        if ( !charsetAttr && !httpEquivAttr )
+            continue;
+
+        /* Meta charset comes in quite a few flavors:
+           1. META with only charset="..." - expected for (X)HTML5. */
+        if ( charsetAttr && !httpEquivAttr )
+        {
+            ctmbstr declared = charsetAttr->value ? charsetAttr->value : "";
+
+            /* we already found one */
+            if ( charsetFound )
+            {
+                TY_(DiscardElement)( doc, node );
+                printf("WARNING ABOUT DISCARDING ELEMENT \n");
+                continue;
+            }
+            charsetFound = yes;
+            /* Compare without lower-casing the attribute value in place. */
+            if ( TY_(tmbstrcasecmp)(declared, enc) != 0 )
+                printf("WARN ABOUT MISMATCH: %s not match output %s \n", declared, enc);
+            /* Match or not, it belongs at the top of HEAD. */
+            if ( node != head->content )
+            {
+                TY_(RemoveNode)( node );
+                TY_(InsertNodeAtStart)( head, node );
+            }
+            continue;
+        }
+
+        /* 2. META with http-equiv="content-type" and a content="...; charset=..."
+           value - expected for HTML4. This is normally ok - but can clash. */
+        if ( httpEquivAttr && !charsetAttr )
+        {
+            contentAttr = TY_(AttrGetById)(node, TidyAttr_CONTENT);
+            if ( !contentAttr || !contentAttr->value || !httpEquivAttr->value )
+                continue;
+            if ( TY_(tmbstrcasecmp)(httpEquivAttr->value, "content-type") != 0 )
+                continue;
+            /* NOTE(review): this reports when the content DOES name the output
+               encoding; confirm that is the intended "clash" condition. */
+            if ( TY_(tmbsubstr)(contentAttr->value, expected) )
+                printf("WARN ABOUT CLASH: %s \n", contentAttr->value);
+            continue;
+        }
+
+        /* 3. META carrying both charset and http-equiv.
+           This is generally bad. */
+        printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n");
+    }
+
+    if ( !charsetFound )
+    {
+        node = TY_(InferredTag)(doc, TidyTag_META);
+        switch ( TY_(HTMLVersion)(doc) )
+        {
+        case HT50:
+        case XH50:
+            TY_(AddAttribute)( doc, node, "charset", enc );
+            break;
+        default:
+            /* Hand over the complete value; appending to the stored attribute
+               value afterwards would write past its end. Assumes AddAttribute
+               keeps its own copy of the string - TODO confirm. */
+            TY_(AddAttribute)( doc, node, "http-equiv", "content-type" );
+            TY_(AddAttribute)( doc, node, "content", contentValue );
+            break;
+        }
+        TY_(InsertNodeAtStart)( head, node );
+    }
+
+    TidyDocFree( doc, contentValue );
+    return yes;
 }
 
 /* add meta element for Tidy */