From a772bbb17f969a8afba3a538648e4404797de545 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Wed, 20 Jun 2012 16:55:42 +0900 Subject: [PATCH] Let's actually commit the -gdoc feature this time. --- .gitignore | 5 +++++ build/gmake/Makefile | 6 +++--- console/tidy.c | 10 ++++++++++ include/tidyenum.h | 1 + quickref.html | 28 +++++++++++++++++++++++++++- src/attrs.c | 21 +++++++++++++++++++++ src/attrs.h | 2 ++ src/config.c | 1 + src/localize.c | 8 ++++++++ src/mappedio.c | 0 src/mappedio.h | 0 src/tidylib.c | 6 ++++++ src/version.h | 2 +- 13 files changed, 85 insertions(+), 5 deletions(-) mode change 100755 => 100644 src/mappedio.c mode change 100755 => 100644 src/mappedio.h diff --git a/.gitignore b/.gitignore index e69f6db..83177cb 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,11 @@ /htmldoc/tidy.1 /htmldoc/quickref.html /lib/ +/autom4te.cache/ +/console/.deps/ +/console/.libs/ +/src/.deps/ +/src/.libs/ *.user *.suo *.sdf diff --git a/build/gmake/Makefile b/build/gmake/Makefile index 8640dff..69e8fda 100644 --- a/build/gmake/Makefile +++ b/build/gmake/Makefile @@ -145,7 +145,7 @@ OBJFILES=\ $(OBJDIR)/attrask$(OBJSUF) $(OBJDIR)/attrdict$(OBJSUF) $(OBJDIR)/attrget$(OBJSUF) \ $(OBJDIR)/buffio$(OBJSUF) $(OBJDIR)/fileio$(OBJSUF) $(OBJDIR)/streamio$(OBJSUF) \ $(OBJDIR)/tagask$(OBJSUF) $(OBJDIR)/tmbstr$(OBJSUF) $(OBJDIR)/utf8$(OBJSUF) \ - $(OBJDIR)/tidylib$(OBJSUF) $(OBJDIR)/mappedio$(OBJSUF) + $(OBJDIR)/tidylib$(OBJSUF) $(OBJDIR)/mappedio$(OBJSUF) $(OBJDIR)/gdoc$(OBJSUF) CFILES= \ $(SRCDIR)/access.c $(SRCDIR)/attrs.c $(SRCDIR)/istack.c \ @@ -155,7 +155,7 @@ CFILES= \ $(SRCDIR)/attrask.c $(SRCDIR)/attrdict.c $(SRCDIR)/attrget.c \ $(SRCDIR)/buffio.c $(SRCDIR)/fileio.c $(SRCDIR)/streamio.c \ $(SRCDIR)/tagask.c $(SRCDIR)/tmbstr.c $(SRCDIR)/utf8.c \ - $(SRCDIR)/tidylib.c $(SRCDIR)/mappedio.c + $(SRCDIR)/tidylib.c $(SRCDIR)/mappedio.c $(SRCDIR)/gdoc.c HFILES= $(INCDIR)/platform.h $(INCDIR)/tidy.h $(INCDIR)/tidyenum.h \ $(INCDIR)/buffio.h @@ -167,7 +167,7 @@ 
LIBHFILES= \ $(SRCDIR)/mappedio.h $(SRCDIR)/message.h $(SRCDIR)/parser.h \ $(SRCDIR)/pprint.h $(SRCDIR)/streamio.h $(SRCDIR)/tags.h \ $(SRCDIR)/tmbstr.h $(SRCDIR)/utf8.h $(SRCDIR)/tidy-int.h \ - $(SRCDIR)/version.h + $(SRCDIR)/gdoc.h $(SRCDIR)/version.h diff --git a/console/tidy.c b/console/tidy.c index 76d6888..d821bde 100644 --- a/console/tidy.c +++ b/console/tidy.c @@ -181,6 +181,9 @@ static const CmdOptDesc cmdopt_defs[] = { { "-bare", "strip out smart quotes and em dashes, etc.", "bare: yes", CmdOptProcDir, "-b" }, + { "-gdoc", + "produce clean version of html exported by google docs", + "gdoc: yes", CmdOptProcDir, "-g" }, { "-numeric", "output numeric rather than named entities", "numeric-entities: yes", CmdOptProcDir, "-n" }, @@ -1010,6 +1013,9 @@ int main( int argc, char** argv ) else if ( strcasecmp(arg, "clean") == 0 ) tidyOptSetBool( tdoc, TidyMakeClean, yes ); + else if ( strcasecmp(arg, "gdoc") == 0 ) + tidyOptSetBool( tdoc, TidyGDocClean, yes ); + else if ( strcasecmp(arg, "bare") == 0 ) tidyOptSetBool( tdoc, TidyMakeBare, yes ); @@ -1227,6 +1233,10 @@ int main( int argc, char** argv ) tidyOptSetBool( tdoc, TidyMakeClean, yes ); break; + case 'g': + tidyOptSetBool( tdoc, TidyGDocClean, yes ); + break; + case 'b': tidyOptSetBool( tdoc, TidyMakeBare, yes ); break; diff --git a/include/tidyenum.h b/include/tidyenum.h index f253942..d4174e7 100644 --- a/include/tidyenum.h +++ b/include/tidyenum.h @@ -126,6 +126,7 @@ typedef enum TidyUpperCaseAttrs, /**< Output attributes in upper not lower case */ TidyMakeBare, /**< Make bare HTML: remove Microsoft cruft */ TidyMakeClean, /**< Replace presentational clutter by style rules */ + TidyGDocClean, /**< Clean up HTML exported from Google Docs */ TidyLogicalEmphasis, /**< Replace i by em and b by strong */ TidyDropPropAttrs, /**< Discard proprietary attributes */ TidyDropFontTags, /**< Discard presentation tags */ diff --git a/quickref.html b/quickref.html index a09d0a5..b4f7e3b 100644 --- a/quickref.html +++ 
b/quickref.html @@ -8,7 +8,7 @@

Quick Reference

HTML Tidy Configuration Options

-

Version: https://github.com/w3c/tidy-html5/tree/d193420

+

Version: https://github.com/w3c/tidy-html5/tree/f212c3f

HTML, XHTML, XML
@@ -188,6 +188,13 @@ Boolean yes + + + gdoc + + Boolean + no + hide-comments @@ -1134,6 +1141,25 @@   + + gdoc + + Top + + + + Type: Boolean
+ Default: no
Example: y/n, yes/no, t/f, true/false, 1/0 + + drop-font-tags + + + + This option specifies if Tidy should enable specific behavior for cleaning up HTML exported from Google Docs. + + +   + hide-comments diff --git a/src/attrs.c b/src/attrs.c index 44abfc7..7472468 100644 --- a/src/attrs.c +++ b/src/attrs.c @@ -751,6 +751,27 @@ AttVal* TY_(GetAttrByName)( Node *node, ctmbstr name ) return attr; } +void TY_(DropAttrByName)( TidyDocImpl* doc, Node *node, ctmbstr name ) +{ + AttVal *attr, *prev = NULL, *next; + + for (attr = node->attributes; attr != NULL; prev = attr, attr = next) + { + next = attr->next; + + if (attr->attribute && TY_(tmbstrcmp)(attr->attribute, name) == 0) + { + if (prev) + prev->next = next; + else + node->attributes = next; + + TY_(FreeAttribute)( doc, attr ); + break; + } + } +} + AttVal* TY_(AddAttribute)( TidyDocImpl* doc, Node *node, ctmbstr name, ctmbstr value ) { diff --git a/src/attrs.h b/src/attrs.h index 7b06ab4..fb9a6af 100644 --- a/src/attrs.h +++ b/src/attrs.h @@ -87,6 +87,8 @@ const Attribute* TY_(FindAttribute)( TidyDocImpl* doc, AttVal *attval ); AttVal* TY_(GetAttrByName)( Node *node, ctmbstr name ); +void TY_(DropAttrByName)( TidyDocImpl* doc, Node *node, ctmbstr name ); + AttVal* TY_(AddAttribute)( TidyDocImpl* doc, Node *node, ctmbstr name, ctmbstr value ); diff --git a/src/config.c b/src/config.c index f557e71..7a952cf 100644 --- a/src/config.c +++ b/src/config.c @@ -252,6 +252,7 @@ static const TidyOptionImpl option_defs[] = { TidyUpperCaseAttrs, MU, "uppercase-attributes", BL, no, ParseBool, boolPicks }, { TidyMakeBare, MU, "bare", BL, no, ParseBool, boolPicks }, { TidyMakeClean, MU, "clean", BL, no, ParseBool, boolPicks }, + { TidyGDocClean, MU, "gdoc", BL, no, ParseBool, boolPicks }, { TidyLogicalEmphasis, MU, "logical-emphasis", BL, no, ParseBool, boolPicks }, { TidyDropPropAttrs, MU, "drop-proprietary-attributes", BL, no, ParseBool, boolPicks }, { TidyDropFontTags, MU, "drop-font-tags", BL, no, ParseBool, boolPicks }, 
diff --git a/src/localize.c b/src/localize.c index 5cd2063..a5c1d65 100644 --- a/src/localize.c +++ b/src/localize.c @@ -359,6 +359,8 @@ static const TidyOptionId TidyDropFontTagsLinks[] = { TidyMakeClean, TidyUnknownOption }; static const TidyOptionId TidyMakeCleanTagsLinks[] = { TidyDropFontTags, TidyUnknownOption }; +static const TidyOptionId TidyGDocCleanLinks[] = + { TidyMakeClean, TidyUnknownOption }; /* Documentation of options */ static const TidyOptionDoc option_docs[] = @@ -405,6 +407,12 @@ static const TidyOptionDoc option_docs[] = "on the HTML saved by Microsoft Office products. " , TidyMakeCleanTagsLinks }, + {TidyGDocClean, + "This option specifies if Tidy " + "should enable specific behavior for cleaning up HTML exported from " + "Google Docs. " + , TidyGDocCleanLinks + }, {TidyDoctype, "This option specifies the DOCTYPE declaration generated by Tidy.
" "If set to \"omit\" the output won't contain a DOCTYPE declaration.
" diff --git a/src/mappedio.c b/src/mappedio.c old mode 100755 new mode 100644 diff --git a/src/mappedio.h b/src/mappedio.h old mode 100755 new mode 100644 diff --git a/src/tidylib.c b/src/tidylib.c index 0ff8cd6..5756d19 100644 --- a/src/tidylib.c +++ b/src/tidylib.c @@ -29,6 +29,7 @@ #include "tidy-int.h" #include "parser.h" #include "clean.h" +#include "gdoc.h" #include "config.h" #include "message.h" #include "pprint.h" @@ -1238,6 +1239,7 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc ) Bool word2K = cfgBool( doc, TidyWord2000 ); Bool logical = cfgBool( doc, TidyLogicalEmphasis ); Bool clean = cfgBool( doc, TidyMakeClean ); + Bool gdoc = cfgBool( doc, TidyGDocClean ); Bool dropFont = cfgBool( doc, TidyDropFontTags ); Bool htmlOut = cfgBool( doc, TidyHtmlOut ); Bool xmlOut = cfgBool( doc, TidyXmlOut ); @@ -1278,6 +1280,10 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc ) if ( clean || dropFont ) TY_(CleanDocument)( doc ); + /* clean up html exported by Google Docs */ + if ( gdoc ) + TY_(CleanGoogleDocument)( doc ); + /* Move terminating
tags from out of paragraphs */ /*! Do we want to do this for all block-level elements? */ diff --git a/src/version.h b/src/version.h index 88bc5ff..dab783c 100644 --- a/src/version.h +++ b/src/version.h @@ -1 +1 @@ -static const char TY_(release_date)[] = "https://github.com/w3c/tidy-html5/tree/8025154"; \ No newline at end of file +static const char TY_(release_date)[] = "https://github.com/w3c/tidy-html5/tree/45fce5e"; \ No newline at end of file