diff --git a/src/gdoc.c b/src/gdoc.c new file mode 100644 index 0000000..62409dc --- /dev/null +++ b/src/gdoc.c @@ -0,0 +1,180 @@ +/* + clean.c -- clean up misuse of presentation markup + + (c) 1998-2008 (W3C) MIT, ERCIM, Keio University + See tidy.h for the copyright notice. + + CVS Info : + + $Author: arnaud02 $ + $Date: 2008/10/14 12:18:10 $ + $Revision: 1.111 $ + + Filters from other formats such as Microsoft Word + often make excessive use of presentation markup such + as font tags, B, I, and the align attribute. By applying + a set of production rules, it is straight forward to + transform this to use CSS. + + Some rules replace some of the children of an element by + style properties on the element, e.g. + +
<p><b>...</b></p> -> <p style="font-weight: bold">...</p>
+ + Such rules are applied to the element's content and then + to the element itself until no more of the rules apply. + Having applied all the rules to an element, it will have + a style attribute with one or more properties. + + Other rules strip the element they apply to, replacing + it by style properties on the contents, e.g. + + <dir><li><p>...</li></dir> -> <p style="margin-left 1em">...
+
+ These rules are applied to an element before processing
+ its content and replace the current element by the first
+ element in the exposed content.
+
+ After applying both sets of rules, you can replace the
+ style attribute by a class value and style rule in the
+ document head. To support this, an association of styles
+ and class names is built.
+
+ A naive approach is to rely on string matching to test
+ when two property lists are the same. A better approach
+ would be to first sort the properties before matching.
+
+*/
+
+#include elements
+*/
+/* Entry point for cleaning up a document exported by Google Docs.
+**
+** Runs CleanNode() on the tree starting at the document's root node
+** (&doc->root) and then calls SetUTF8() on the document.
+**
+** doc: the document to clean.  It is dereferenced without a NULL
+**      check, so callers must pass a valid pointer.
+**
+** Returns nothing; the document is modified in place.
+*/
+void TY_(CleanGoogleDocument)( TidyDocImpl* doc )
+{
+ /* placeholder. CleanTree()/CleanNode() will not
+ ** zap root element
+ **
+ ** NOTE(review): presumably this means it is safe to hand the root
+ ** itself to CleanNode() because that helper never removes the node
+ ** it is given -- confirm against CleanNode().
+ */
+ CleanNode( doc, &doc->root );
+ /* NOTE(review): presumably switches the document's encoding settings
+ ** to UTF-8 once the tree has been cleaned -- confirm against SetUTF8().
+ */
+ SetUTF8( doc );
+}
+
+/*
+ * local variables:
+ * mode: c
+ * indent-tabs-mode: nil
+ * c-basic-offset: 4
+ * eval: (c-set-offset 'substatement-open 0)
+ * end:
+ */
diff --git a/src/gdoc.h b/src/gdoc.h
new file mode 100644
index 0000000..22bb248
--- /dev/null
+++ b/src/gdoc.h
@@ -0,0 +1,26 @@
+#ifndef __GDOC_H__
+#define __GDOC_H__
+
+/* gdoc.h -- clean up html exported by Google Docs
+
+ (c) 2012 (W3C) MIT, ERCIM, Keio University
+ See tidy.h for the copyright notice.
+
+ CVS Info:
+ $Author: dsr $
+ $Date: 2012/06/07 13:14:00 $
+ $Revision: 1.0 $
+
+*/
+
+/* clean html exported by Google Docs
+
+ - strip the script element, as the style sheet is a mess
+ - strip class attributes
+ - strip span elements, leaving their content in place
+ - replace <a name=...> by id on parent element
+ - strip empty <p> elements
+
+ doc: the document to clean; it is modified in place.
+
+ NOTE(review): the tag names in the last two items (<a name=...> and
+ <p>) were missing from this comment and have been reconstructed --
+ confirm against the implementation in gdoc.c.
+*/
+void TY_(CleanGoogleDocument)( TidyDocImpl* doc );
+
+#endif /* __GDOC_H__ */