add another 'cleaner' of html from google docs
This commit is contained in:
parent
39b860b1a7
commit
e0a61e9b06
174
src/gdoc.c
Normal file
174
src/gdoc.c
Normal file
|
@ -0,0 +1,174 @@
|
||||||
|
/*
|
||||||
|
gdoc.c -- clean up html exported by Google Docs
|
||||||
|
|
||||||
|
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
|
||||||
|
See tidy.h for the copyright notice.
|
||||||
|
|
||||||
|
Filters from other formats such as Microsoft Word
|
||||||
|
often make excessive use of presentation markup such
|
||||||
|
as font tags, B, I, and the align attribute. By applying
|
||||||
|
a set of production rules, it is straight forward to
|
||||||
|
transform this to use CSS.
|
||||||
|
|
||||||
|
Some rules replace some of the children of an element by
|
||||||
|
style properties on the element, e.g.
|
||||||
|
|
||||||
|
<p><b>...</b></p> -> <p style="font-weight: bold">...</p>
|
||||||
|
|
||||||
|
Such rules are applied to the element's content and then
|
||||||
|
to the element itself until no more rules apply.
|
||||||
|
Having applied all the rules to an element, it will have
|
||||||
|
a style attribute with one or more properties.
|
||||||
|
|
||||||
|
Other rules strip the element they apply to, replacing
|
||||||
|
it by style properties on the contents, e.g.
|
||||||
|
|
||||||
|
<dir><li><p>...</li></dir> -> <p style="margin-left 1em">...
|
||||||
|
|
||||||
|
These rules are applied to an element before processing
|
||||||
|
its content and replace the current element by the first
|
||||||
|
element in the exposed content.
|
||||||
|
|
||||||
|
After applying both sets of rules, you can replace the
|
||||||
|
style attribute by a class value and style rule in the
|
||||||
|
document head. To support this, an association of styles
|
||||||
|
and class names is built.
|
||||||
|
|
||||||
|
A naive approach is to rely on string matching to test
|
||||||
|
when two property lists are the same. A better approach
|
||||||
|
would be to first sort the properties before matching.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include "tidy-int.h"
|
||||||
|
#include "gdoc.h"
|
||||||
|
#include "lexer.h"
|
||||||
|
#include "parser.h"
|
||||||
|
#include "tags.h"
|
||||||
|
#include "attrs.h"
|
||||||
|
#include "message.h"
|
||||||
|
#include "tmbstr.h"
|
||||||
|
#include "utf8.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
Extricate "element", replace it by its content and delete it.
|
||||||
|
*/
|
||||||
|
static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
|
||||||
|
{
|
||||||
|
if (element->content)
|
||||||
|
{
|
||||||
|
Node *node, *parent = element->parent;
|
||||||
|
|
||||||
|
element->last->next = element->next;
|
||||||
|
|
||||||
|
if (element->next)
|
||||||
|
{
|
||||||
|
element->next->prev = element->last;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
parent->last = element->last;
|
||||||
|
|
||||||
|
if (element->prev)
|
||||||
|
{
|
||||||
|
element->content->prev = element->prev;
|
||||||
|
element->prev->next = element->content;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
parent->content = element->content;
|
||||||
|
|
||||||
|
for (node = element->content; node; node = node->next)
|
||||||
|
node->parent = parent;
|
||||||
|
|
||||||
|
*pnode = element->content;
|
||||||
|
|
||||||
|
element->next = element->content = NULL;
|
||||||
|
TY_(FreeNode)(doc, element);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
*pnode = TY_(DiscardElement)(doc, element);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
  Recursively clean the element children of "node":

  - <style> elements are discarded
  - empty <p> elements are discarded
  - <span> elements are replaced by their content
  - empty <a> elements are discarded; if such an anchor has a
    name attribute, its value is set as the id of the parent
  - any other element loses its class attribute and has its
    own children cleaned in turn

  Non-element children are left untouched, and "node" itself is
  never removed.
*/
static void CleanNode( TidyDocImpl* doc, Node *node )
{
    Node *child, *next;

    for (child = node->content; child != NULL; child = next)
    {
        /* remember the successor now: child may be gone below */
        next = child->next;

        if (!TY_(nodeIsElement)(child))
            continue;

        /* The cases are mutually exclusive and must stay a single
        ** else-if chain: once a child has been discarded it must
        ** not be inspected again.
        */
        if (nodeIsSTYLE(child))
            TY_(DiscardElement)(doc, child);
        else if (nodeIsP(child) && !child->content)
            TY_(DiscardElement)(doc, child);
        else if (nodeIsSPAN(child))
        {
            /* next is redirected to the first exposed node, so the
            ** former content of the span is cleaned by this loop
            */
            DiscardContainer( doc, child, &next);
        }
        else if (nodeIsA(child) && !child->content)
        {
            AttVal *id = TY_(GetAttrByName)( child, "name" );

            /* keep the anchor target alive as an id on the parent */
            if (id)
                TY_(RepairAttrValue)( doc, child->parent, "id", id->value );

            TY_(DiscardElement)(doc, child);
        }
        else
        {
            if (child->attributes)
                TY_(DropAttrByName)( doc, child, "class" );

            CleanNode(doc, child);
        }
    }
}
|
||||||
|
|
||||||
|
/* insert meta element to force browser to recognize doc as UTF8 */
|
||||||
|
static void SetUTF8( TidyDocImpl* doc )
|
||||||
|
{
|
||||||
|
Node *head = TY_(FindHEAD)( doc );
|
||||||
|
|
||||||
|
if (head)
|
||||||
|
{
|
||||||
|
Node *node = TY_(InferredTag)(doc, TidyTag_META);
|
||||||
|
TY_(AddAttribute)( doc, node, "http-equiv", "Content-Type" );
|
||||||
|
TY_(AddAttribute)( doc, node, "content", "text/html; charset=UTF-8" );
|
||||||
|
TY_(InsertNodeAtStart)( head, node );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* clean html exported by Google Docs
|
||||||
|
|
||||||
|
- strip the script element, as the style sheet is a mess
|
||||||
|
- strip class attributes
|
||||||
|
- strip span elements, leaving their content in place
|
||||||
|
- replace <a name=...></a> by id on parent element
|
||||||
|
- strip empty <p> elements
|
||||||
|
*/
|
||||||
|
void TY_(CleanGoogleDocument)( TidyDocImpl* doc )
|
||||||
|
{
|
||||||
|
/* placeholder. CleanTree()/CleanNode() will not
|
||||||
|
** zap root element
|
||||||
|
*/
|
||||||
|
CleanNode( doc, &doc->root );
|
||||||
|
SetUTF8( doc );
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* local variables:
|
||||||
|
* mode: c
|
||||||
|
* indent-tabs-mode: nil
|
||||||
|
* c-basic-offset: 4
|
||||||
|
* eval: (c-set-offset 'substatement-open 0)
|
||||||
|
* end:
|
||||||
|
*/
|
19
src/gdoc.h
Normal file
19
src/gdoc.h
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
#ifndef __GDOC_H__
|
||||||
|
#define __GDOC_H__
|
||||||
|
|
||||||
|
/* gdoc.h -- clean up html exported by Google Docs
|
||||||
|
|
||||||
|
(c) 2012 (W3C) MIT, ERCIM, Keio University
|
||||||
|
See tidy.h for the copyright notice.
|
||||||
|
|
||||||
|
- strip the script element, as the style sheet is a mess
|
||||||
|
- strip class attributes
|
||||||
|
- strip span elements, leaving their content in place
|
||||||
|
- replace <a name=...></a> by id on parent element
|
||||||
|
- strip empty <p> elements
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
void TY_(CleanGoogleDocument)( TidyDocImpl* doc );
|
||||||
|
|
||||||
|
#endif /* __GDOC_H__ */
|
Loading…
Reference in a new issue