From 09e310b50c6a0c176cb3bf7e5fd8a64c52a08828 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Wed, 20 Jun 2012 16:48:12 +0900 Subject: [PATCH] -gdoc opt, clean Google Docs HTML; fr Dave Raggett --- src/gdoc.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/gdoc.h | 26 ++++++++ 2 files changed, 206 insertions(+) create mode 100644 src/gdoc.c create mode 100644 src/gdoc.h diff --git a/src/gdoc.c b/src/gdoc.c new file mode 100644 index 0000000..62409dc --- /dev/null +++ b/src/gdoc.c @@ -0,0 +1,180 @@ +/* + clean.c -- clean up misuse of presentation markup + + (c) 1998-2008 (W3C) MIT, ERCIM, Keio University + See tidy.h for the copyright notice. + + CVS Info : + + $Author: arnaud02 $ + $Date: 2008/10/14 12:18:10 $ + $Revision: 1.111 $ + + Filters from other formats such as Microsoft Word + often make excessive use of presentation markup such + as font tags, B, I, and the align attribute. By applying + a set of production rules, it is straight forward to + transform this to use CSS. + + Some rules replace some of the children of an element by + style properties on the element, e.g. + +

<p><b>...</b></p> -> <p style="font-weight: bold">...</p>
+ + Such rules are applied to the element's content and then + to the element itself until no more of the rules apply. + Having applied all the rules to an element, it will have + a style attribute with one or more properties. + + Other rules strip the element they apply to, replacing + it by style properties on the contents, e.g. + +
  <dir><li><p>...</li></dir> -> <p style="margin-left 1em">
    ...
+
+  These rules are applied to an element before processing
+  its content and replace the current element by the first
+  element in the exposed content.
+
+  After applying both sets of rules, you can replace the
+  style attribute by a class value and style rule in the
+  document head. To support this, an association of styles
+  and class names is built.
+
+  A naive approach is to rely on string matching to test
+  when two property lists are the same. A better approach
+  would be to first sort the properties before matching.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "tidy-int.h"
+#include "gdoc.h"
+#include "lexer.h"
+#include "parser.h"
+#include "tags.h"
+#include "attrs.h"
+#include "message.h"
+#include "tmbstr.h"
+#include "utf8.h"
+
+/*
+  Extricate "element", replace it by its content and delete it.
+
+  doc     - document passed through to TY_(FreeNode) and
+            TY_(DiscardElement)
+  element - node to remove; its children, if any, are spliced into
+            the parent's child list at the position element occupied
+  pnode   - out: receives element's first child when element has
+            content, otherwise whatever TY_(DiscardElement) returns
+            (presumably the node that followed element -- confirm
+            against its definition)
+
+  Dereferences element->parent without a check; the only caller in
+  this file passes a child taken from another node's content list.
+*/
+static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
+{
+    if (element->content)
+    {
+        Node *node, *parent = element->parent;
+
+        /* hook the last child up to whatever followed element */
+        element->last->next = element->next;
+
+        if (element->next)
+        {
+            element->next->prev = element->last;
+        }
+        else
+            parent->last = element->last;
+
+        /* hook the first child up to whatever preceded element */
+        if (element->prev)
+        {
+            element->content->prev = element->prev;
+            element->prev->next = element->content;
+        }
+        else
+            parent->content = element->content;
+
+        /* The children are already chained to element's former
+        ** following siblings, so this walk also visits those
+        ** siblings; it assigns them the parent they already have.
+        */
+        for (node = element->content; node; node = node->next)
+            node->parent = parent;
+
+        *pnode = element->content;
+
+        /* Detach the links before freeing -- presumably so that
+        ** TY_(FreeNode) cannot follow them into nodes that are
+        ** still part of the tree (confirm against its definition).
+        */
+        element->next = element->content = NULL;
+        TY_(FreeNode)(doc, element);
+    }
+    else
+    {
+        *pnode = TY_(DiscardElement)(doc, element);
+    }
+}
+
+/*
+  Apply the Google Docs clean-up rules to the children of "node".
+
+  Each child that TY_(nodeIsElement) accepts is handled by exactly
+  one of these rules, tested in this order:
+
+   - style element: discarded
+   - p element without content: discarded
+   - span element: replaced by its content via DiscardContainer();
+     the loop then resumes at the node DiscardContainer() reports,
+     so the exposed content is cleaned by this same loop
+   - a element without content: the value of its "name" attribute,
+     if it has one, is set as "id" on the parent, then the element
+     is discarded
+   - anything else: its "class" attribute is dropped and the
+     function recurses into it
+
+  Other children are left untouched.  "node" itself is never
+  removed or modified, apart from the "id" attribute rule above.
+*/
+static void CleanNode( TidyDocImpl* doc, Node *node )
+{
+    Node *child, *next;
+
+    /* a node without content simply yields zero iterations */
+    for (child = node->content; child != NULL; child = next)
+    {
+        /* fetch the successor first: child may be discarded below */
+        next = child->next;
+
+        if (!TY_(nodeIsElement)(child))
+            continue;
+
+        /* The rules form a single else-if chain.  Once a branch has
+        ** handed child to TY_(DiscardElement) or DiscardContainer()
+        ** it must not be inspected again; the style rule used to be
+        ** a stand-alone "if", so a discarded style element was still
+        ** tested by the following rules and finally recursed into.
+        */
+        if (nodeIsSTYLE(child))
+            TY_(DiscardElement)(doc, child);
+        else if (nodeIsP(child) && !child->content)
+            TY_(DiscardElement)(doc, child);
+        else if (nodeIsSPAN(child))
+            DiscardContainer( doc, child, &next);
+        else if (nodeIsA(child) && !child->content)
+        {
+            AttVal *id = TY_(GetAttrByName)( child, "name" );
+
+            if (id)
+                TY_(RepairAttrValue)( doc, child->parent, "id", id->value );
+
+            TY_(DiscardElement)(doc, child);
+        }
+        else
+        {
+            if (child->attributes)
+                TY_(DropAttrByName)( doc, child, "class" );
+
+            CleanNode(doc, child);
+        }
+    }
+}
+
+/* insert meta element to force browser to recognize doc as UTF8
+
+   Builds <meta http-equiv="Content-Type"
+   content="text/html; charset=UTF-8"> and inserts it at the start
+   of the node returned by TY_(FindHEAD).  Nothing is inserted when
+   TY_(FindHEAD) returns NULL.  No check is made for a meta element
+   that is already present.
+*/
+static void SetUTF8( TidyDocImpl* doc )
+{
+    Node *head = TY_(FindHEAD)( doc );
+
+    if (head)
+    {
+        Node *node = TY_(InferredTag)(doc, TidyTag_META);
+        TY_(AddAttribute)( doc, node, "http-equiv", "Content-Type" );
+        TY_(AddAttribute)( doc, node, "content", "text/html; charset=UTF-8" );
+        TY_(InsertNodeAtStart)( head, node );
+    }
+}
+
+/* clean html exported by Google Docs
+
+ - strip the style element, as the style sheet is a mess
+ - strip class attributes
+ - strip span elements, leaving their content in place
+ - replace <a name=...></a> by id on parent element
+ - strip empty <p>

    elements
+*/
+void TY_(CleanGoogleDocument)( TidyDocImpl* doc )
+{
+    /* Start at the document root: CleanNode() only rewrites the
+    ** children of the node it is given, so the root itself stays.
+    */
+    CleanNode( doc, &doc->root );
+    SetUTF8( doc ); /* afterwards, add the UTF-8 meta element to head */
+}
+
+/*
+ * local variables:
+ * mode: c
+ * indent-tabs-mode: nil
+ * c-basic-offset: 4
+ * eval: (c-set-offset 'substatement-open 0)
+ * end:
+ */
diff --git a/src/gdoc.h b/src/gdoc.h
new file mode 100644
index 0000000..22bb248
--- /dev/null
+++ b/src/gdoc.h
@@ -0,0 +1,26 @@
+#ifndef __GDOC_H__
+#define __GDOC_H__
+
+/* gdoc.h -- clean up html exported by Google Docs
+
+   (c) 2012 (W3C) MIT, ERCIM, Keio University
+   See tidy.h for the copyright notice.
+
+   CVS Info:
+     $Author: dsr $
+     $Date: 2012/06/07 13:14:00 $
+     $Revision: 1.0 $
+
+*/
+
+/* clean html exported by Google Docs
+
+ - strip the style element, as the style sheet is a mess
+ - strip class attributes
+ - strip span elements, leaving their content in place
+ - replace <a name=...></a> by id on parent element
+ - strip empty <p> elements
+*/
+void TY_(CleanGoogleDocument)( TidyDocImpl* doc );
+
+#endif /* __GDOC_H__ */