main code updates to do HTML5

This commit is contained in:
Geoff McLane 2014-08-03 20:33:29 +02:00
parent 292145c8e2
commit 78c0080eb8
18 changed files with 1999 additions and 1259 deletions

View file

@ -9,6 +9,13 @@
*/
#include "tidy.h"
#ifdef _MSC_VER
#include "sprtf.h"
#endif
#ifndef SPRTF
#define SPRTF printf
#endif
static FILE* errout = NULL; /* set to stderr */
/* static FILE* txtout = NULL; */ /* set to stdout */
@ -176,6 +183,9 @@ static const CmdOptDesc cmdopt_defs[] = {
{ "-bare",
"strip out smart quotes and em dashes, etc.",
"bare: yes", CmdOptProcDir, "-b" },
{ "-gdoc",
"produce clean version of html exported by google docs",
"gdoc: yes", CmdOptProcDir, "-g" },
{ "-numeric",
"output numeric rather than named entities",
"numeric-entities: yes", CmdOptProcDir, "-n" },
@ -186,8 +196,8 @@ static const CmdOptDesc cmdopt_defs[] = {
"suppress nonessential output",
"quiet: yes", CmdOptProcDir, "-q" },
{ "-omit",
"omit optional end tags",
"hide-endtags: yes", CmdOptProcDir },
"omit optional start tags and end tags",
"omit-optional-tags: yes", CmdOptProcDir },
{ "-xml",
"specify the input is well formed XML",
"input-xml: yes", CmdOptProcDir },
@ -411,14 +421,16 @@ static void help( ctmbstr prog )
{
printf( "%s [option...] [file...] [option...] [file...]\n", prog );
printf( "Utility to clean up and pretty print HTML/XHTML/XML\n");
printf( "See http://tidy.sourceforge.net/\n");
printf( "\n");
printf( "This is an HTML5-aware experimental fork of HTML Tidy.\n");
printf( "%s\n", tidyReleaseDate() );
printf( "\n");
#ifdef PLATFORM_NAME
printf( "Options for HTML Tidy for %s released on %s:\n",
PLATFORM_NAME, tidyReleaseDate() );
printf( "Options for HTML Tidy for %s:\n", PLATFORM_NAME );
#else
printf( "Options for HTML Tidy released on %s:\n", tidyReleaseDate() );
printf( "Options for HTML Tidy:\n");
#endif
printf( "\n");
@ -429,9 +441,27 @@ static void help( ctmbstr prog )
"to the man page.\n\n");
printf( "Input/Output default to stdin/stdout respectively.\n");
printf( "\n");
printf( "Single letter options apart from -f may be combined\n");
printf( "as in: tidy -f errs.txt -imu foo.html\n");
printf( "For further info on HTML see http://www.w3.org/MarkUp\n");
printf( "\n");
printf( "For more information on this HTML5-aware experimental fork of Tidy,\n" );
printf( "see http://w3c.github.com/tidy-html5/\n" );
printf( "\n");
printf( "For more information on HTML, see the following:\n" );
printf( "\n");
printf( " HTML: Edition for Web Authors (the latest HTML specification)\n");
printf( " http://dev.w3.org/html5/spec-author-view\n" );
printf( "\n");
printf( " HTML: The Markup Language (an HTML language reference)\n" );
printf( " http://dev.w3.org/html5/markup/\n" );
printf( "\n");
printf( "File bug reports at https://github.com/w3c/tidy-html5/issues/\n" );
printf( "or send questions and comments to html-tidy@w3.org\n" );
printf( "\n");
printf( "Validate your HTML documents using the W3C Nu Markup Validator:\n" );
printf( "\n");
printf( " http://validator.w3.org/nu/" );
printf( "\n");
}
@ -472,6 +502,7 @@ ctmbstr ConfigCategoryName( TidyConfigCategory id )
fprintf(stderr, "Fatal error: impossible value for id='%d'.\n", (int)id);
assert(0);
abort();
return "never_here"; /* only for the compiler warning */
}
/* Description of an option */
@ -898,10 +929,10 @@ static void optionvalues( TidyDoc tdoc )
/* Print the version banner with printf: the program name, the platform
   name when PLATFORM_NAME is defined, and the string returned by
   tidyReleaseDate(). Takes no arguments and returns nothing. */
static void version( void )
{
#ifdef PLATFORM_NAME
printf( "HTML Tidy for %s released on %s\n",
printf( "HTML Tidy for HTML5 for %s %s\n",
PLATFORM_NAME, tidyReleaseDate() );
#else
printf( "HTML Tidy released on %s\n", tidyReleaseDate() );
printf( "HTML Tidy for HTML5 %s\n", tidyReleaseDate() );
#endif
}
@ -923,6 +954,9 @@ int main( int argc, char** argv )
errout = stderr; /* initialize to stderr */
status = 0;
#ifdef _MSC_VER
set_log_file((char *)"temptidy.txt", 0);
#endif
#ifdef TIDY_CONFIG_FILE
if ( tidyFileExists( tdoc, TIDY_CONFIG_FILE) )
@ -977,7 +1011,7 @@ int main( int argc, char** argv )
tidyOptResetToDefault( tdoc, TidyIndentSpaces );
}
else if ( strcasecmp(arg, "omit") == 0 )
tidyOptSetBool( tdoc, TidyHideEndTags, yes );
tidyOptSetBool( tdoc, TidyOmitOptionalTags, yes );
else if ( strcasecmp(arg, "upper") == 0 )
tidyOptSetBool( tdoc, TidyUpperCaseTags, yes );
@ -985,6 +1019,9 @@ int main( int argc, char** argv )
else if ( strcasecmp(arg, "clean") == 0 )
tidyOptSetBool( tdoc, TidyMakeClean, yes );
else if ( strcasecmp(arg, "gdoc") == 0 )
tidyOptSetBool( tdoc, TidyGDocClean, yes );
else if ( strcasecmp(arg, "bare") == 0 )
tidyOptSetBool( tdoc, TidyMakeBare, yes );
@ -1202,6 +1239,10 @@ int main( int argc, char** argv )
tidyOptSetBool( tdoc, TidyMakeClean, yes );
break;
case 'g':
tidyOptSetBool( tdoc, TidyGDocClean, yes );
break;
case 'b':
tidyOptSetBool( tdoc, TidyMakeBare, yes );
break;
@ -1237,6 +1278,7 @@ int main( int argc, char** argv )
if ( argc > 1 )
{
htmlfil = argv[1];
SPRTF("Tidying '%s'\n", htmlfil);
if ( tidyOptGetBool(tdoc, TidyEmacs) )
tidyOptSetValue( tdoc, TidyEmacsFile, htmlfil );
status = tidyParseFile( tdoc, htmlfil );
@ -1263,10 +1305,17 @@ int main( int argc, char** argv )
else
{
ctmbstr outfil = tidyOptGetValue( tdoc, TidyOutFile );
if ( outfil )
if ( outfil ) {
status = tidySaveFile( tdoc, outfil );
else
} else {
#if !defined(NDEBUG) && defined(_MSC_VER)
static char tmp_buf[264];
sprintf(tmp_buf,"%s.html",get_log_file());
status = tidySaveFile( tdoc, tmp_buf );
#else
status = tidySaveStdout( tdoc );
#endif
}
}
}

View file

@ -937,6 +937,10 @@ TIDY_EXPORT Bool TIDY_CALL tidyNodeIsSTRIKE( TidyNode tnod );
TIDY_EXPORT Bool TIDY_CALL tidyNodeIsU( TidyNode tnod );
TIDY_EXPORT Bool TIDY_CALL tidyNodeIsMENU( TidyNode tnod );
/* HTML5 */
TIDY_EXPORT Bool TIDY_CALL tidyNodeIsDATALIST( TidyNode tnod ); /* a bit like OPTIONS */
/** @} End NodeIsElementName group */
/** @} End NodeAsk group */

View file

@ -102,11 +102,14 @@ typedef enum
TidyOutFile, /**< File name to write markup to */
TidyWriteBack, /**< If true then output tidied markup */
TidyShowMarkup, /**< If false, normal output is suppressed */
TidyShowInfo, /**< If true, info-level messages are shown */
TidyShowWarnings, /**< However errors are always shown */
TidyQuiet, /**< No 'Parsing X', guessed DTD or summary */
TidyIndentContent, /**< Indent content of appropriate tags */
/**< "auto" does text/block level content indentation */
TidyHideEndTags, /**< Suppress optional end tags */
TidyCoerceEndTags, /**< Coerce end tags from start tags where probably intended */
TidyOmitOptionalTags,/**< Suppress optional start tags and end tags */
TidyHideEndTags, /**< Legacy name for TidyOmitOptionalTags */
TidyXmlTags, /**< Treat input as XML */
TidyXmlOut, /**< Create output as XML */
TidyXhtmlOut, /**< Output extensible HTML */
@ -117,9 +120,11 @@ typedef enum
TidyUpperCaseAttrs, /**< Output attributes in upper not lower case */
TidyMakeBare, /**< Make bare HTML: remove Microsoft cruft */
TidyMakeClean, /**< Replace presentational clutter by style rules */
TidyGDocClean, /**< Clean up HTML exported from Google Docs */
TidyLogicalEmphasis, /**< Replace i by em and b by strong */
TidyDropPropAttrs, /**< Discard proprietary attributes */
TidyDropFontTags, /**< Discard presentation tags */
TidyDropEmptyElems, /**< Discard empty elements */
TidyDropEmptyParas, /**< Discard empty p elements */
TidyFixComments, /**< Fix comments with adjacent hyphens */
TidyBreakBeforeBR, /**< Output newline before <br> or not? */
@ -192,6 +197,7 @@ typedef enum
#else
TidyPunctWrapNotUsed,
#endif
TidyMergeEmphasis, /**< Merge nested B and I elements */
TidyMergeDivs, /**< Merge multiple DIVs */
TidyDecorateInferredUL, /**< Mark inferred UL elements with no indent CSS */
TidyPreserveEntities, /**< Preserve entities */
@ -234,6 +240,7 @@ typedef enum
*/
typedef enum
{
TidyDoctypeHtml5, /**< <!DOCTYPE html> */
TidyDoctypeOmit, /**< Omit DOCTYPE altogether */
TidyDoctypeAuto, /**< Keep DOCTYPE in input. Set version to content */
TidyDoctypeStrict, /**< Convert document to HTML 4 strict content model */
@ -436,16 +443,20 @@ typedef enum
TidyTag_ARTICLE,
TidyTag_ASIDE,
TidyTag_AUDIO,
TidyTag_BDI,
TidyTag_CANVAS,
TidyTag_COMMAND,
TidyTag_DATALIST,
TidyTag_DETAILS,
TidyTag_DIALOG,
TidyTag_FIGCAPTION,
TidyTag_FIGURE,
TidyTag_FOOTER,
TidyTag_HEADER,
TidyTag_HGROUP,
TidyTag_MAIN,
TidyTag_MARK,
TidyTag_MENUITEM,
TidyTag_METER,
TidyTag_NAV,
TidyTag_OUTPUT,
@ -531,6 +542,7 @@ typedef enum
TidyAttr_HTTP_EQUIV, /**< HTTP_EQUIV= */
TidyAttr_ID, /**< ID= */
TidyAttr_ISMAP, /**< ISMAP= */
TidyAttr_ITEMPROP, /**< ITEMPROP= */
TidyAttr_LABEL, /**< LABEL= */
TidyAttr_LANG, /**< LANG= */
TidyAttr_LANGUAGE, /**< LANGUAGE= */

File diff suppressed because it is too large Load diff

View file

@ -125,14 +125,18 @@ extern const AttrVersion TY_(W3CAttrsFor_HGROUP)[];
extern const AttrVersion TY_(W3CAttrsFor_FIGURE)[];
extern const AttrVersion TY_(W3CAttrsFor_ARTICLE)[];
extern const AttrVersion TY_(W3CAttrsFor_ASIDE)[];
extern const AttrVersion TY_(W3CAttrsFor_BDI)[];
extern const AttrVersion TY_(W3CAttrsFor_NAV)[];
extern const AttrVersion TY_(W3CAttrsFor_SECTION)[];
extern const AttrVersion TY_(W3CAttrsFor_FOOTER)[];
extern const AttrVersion TY_(W3CAttrsFor_HEADER)[];
extern const AttrVersion TY_(W3CAttrsFor_DETAILS)[];
extern const AttrVersion TY_(W3CAttrsFor_DIALOG)[];
extern const AttrVersion TY_(W3CAttrsFor_COMMAND)[];
extern const AttrVersion TY_(W3CAttrsFor_MAIN)[];
extern const AttrVersion TY_(W3CAttrsFor_MARK)[];
extern const AttrVersion TY_(W3CAttrsFor_OUTPUT)[];
extern const AttrVersion TY_(W3CAttrsFor_MENUITEM)[];
extern const AttrVersion TY_(W3CAttrsFor_METER)[];
extern const AttrVersion TY_(W3CAttrsFor_PROGRESS)[];
extern const AttrVersion TY_(W3CAttrsFor_TIME)[];
@ -141,5 +145,8 @@ extern const AttrVersion TY_(W3CAttrsFor_AUDIO)[];
extern const AttrVersion TY_(W3CAttrsFor_VIDEO)[];
extern const AttrVersion TY_(W3CAttrsFor_CANVAS)[];
extern const AttrVersion TY_(W3CAttrsFor_SOURCE)[];
extern const AttrVersion TY_(W3CAttrsFor_EMBED)[];
extern const AttrVersion TY_(W3CAttrsFor_KEYGEN)[];
extern const AttrVersion TY_(W3CAttrsFor_WBR)[];
#endif /* __ATTRDICT_H__ */

View file

@ -2,7 +2,7 @@
(c) 1998-2009 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "tidy-int.h"
@ -152,6 +152,7 @@ static const Attribute attribute_defs [] =
{ TidyAttr_HTTP_EQUIV, "http-equiv", CH_PCDATA }, /* META */
{ TidyAttr_ID, "id", CH_IDDEF },
{ TidyAttr_ISMAP, "ismap", CH_BOOL }, /* IMG */
{ TidyAttr_ITEMPROP, "itemprop", CH_PCDATA },
{ TidyAttr_LABEL, "label", CH_PCDATA }, /* OPT, OPTGROUP */
{ TidyAttr_LANG, "lang", CH_LANG },
{ TidyAttr_LANGUAGE, "language", CH_PCDATA }, /* SCRIPT */
@ -253,7 +254,7 @@ static const Attribute attribute_defs [] =
{ TidyAttr_SDASUFF, "sdasuff", CH_PCDATA }, /* SDATA attribute in HTML 2.0 */
{ TidyAttr_URN, "urn", CH_PCDATA }, /* for <a>, never implemented */
/* "HTML5" */
/* HTML5 */
{ TidyAttr_ASYNC, "async", CH_PCDATA },
{ TidyAttr_AUTOCOMPLETE, "autocomplete", CH_PCDATA },
{ TidyAttr_AUTOFOCUS, "autofocus", CH_PCDATA },
@ -362,7 +363,7 @@ static uint AttributeVersions(Node* node, AttVal* attval)
{
uint i;
/* "HTML5" data-* attributes */
/* HTML5 data-* attributes */
if (attval && attval->attribute)
if (TY_(tmbstrncmp)(attval->attribute, "data-", 5) == 0)
return (XH50 | HT50);
@ -744,6 +745,27 @@ AttVal* TY_(GetAttrByName)( Node *node, ctmbstr name )
return attr;
}
/* Drop the first attribute of node whose name compares equal to name
   under tmbstrcmp. The matching AttVal is unlinked from node->attributes
   and handed to FreeAttribute; at most one attribute is removed, and
   attributes without a name are never matched. */
void TY_(DropAttrByName)( TidyDocImpl* doc, Node *node, ctmbstr name )
{
    /* link always points at the pointer that refers to the current entry,
       so unlinking works the same for the head and for interior nodes */
    AttVal **link = &node->attributes;

    while ( *link != NULL )
    {
        AttVal *cur = *link;

        if ( cur->attribute != NULL &&
             TY_(tmbstrcmp)(cur->attribute, name) == 0 )
        {
            /* splice the match out of the list before releasing it */
            *link = cur->next;
            TY_(FreeAttribute)( doc, cur );
            return;
        }

        link = &cur->next;
    }
}
AttVal* TY_(AddAttribute)( TidyDocImpl* doc,
Node *node, ctmbstr name, ctmbstr value )
{
@ -1360,11 +1382,8 @@ Bool TY_(IsValidHTMLID)(ctmbstr id)
if (!s)
return no;
if (!TY_(IsLetter)(*s++))
return no;
while (*s)
if (!TY_(IsNamechar)(*s++))
if (TY_(IsHTMLSpace)(*s++))
return no;
return yes;
@ -1807,9 +1826,11 @@ void CheckLang( TidyDocImpl* doc, Node *node, AttVal *attval)
/* checks type attribute */
void CheckType( TidyDocImpl* doc, Node *node, AttVal *attval)
{
ctmbstr const valuesINPUT[] = {"text", "password", "checkbox", "radio",
"submit", "reset", "file", "hidden",
"image", "button", NULL};
ctmbstr const valuesINPUT[] = {
"text", "password", "checkbox", "radio", "submit", "reset", "file",
"hidden", "image", "button", "color", "date", "datetime",
"datetime-local", "email", "month", "number", "range", "search",
"tel", "time", "url", "week", NULL};
ctmbstr const valuesBUTTON[] = {"button", "submit", "reset", NULL};
ctmbstr const valuesUL[] = {"disc", "square", "circle", NULL};
ctmbstr const valuesOL[] = {"1", "a", "i", NULL};

View file

@ -5,7 +5,7 @@
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "forward.h"
@ -81,6 +81,8 @@ const Attribute* TY_(FindAttribute)( TidyDocImpl* doc, AttVal *attval );
AttVal* TY_(GetAttrByName)( Node *node, ctmbstr name );
void TY_(DropAttrByName)( TidyDocImpl* doc, Node *node, ctmbstr name );
AttVal* TY_(AddAttribute)( TidyDocImpl* doc,
Node *node, ctmbstr name, ctmbstr value );
@ -217,6 +219,7 @@ uint TY_(NodeAttributeVersions)( Node* node, TidyAttrId id );
#define attrIsHTTP_EQUIV(av) AttrIsId( av, TidyAttr_HTTP_EQUIV )
#define attrIsID(av) AttrIsId( av, TidyAttr_ID )
#define attrIsISMAP(av) AttrIsId( av, TidyAttr_ISMAP )
#define attrIsITEMPROP(av) AttrIsId( av, TidyAttr_ITEMPROP )
#define attrIsLABEL(av) AttrIsId( av, TidyAttr_LABEL )
#define attrIsLANG(av) AttrIsId( av, TidyAttr_LANG )
#define attrIsLANGUAGE(av) AttrIsId( av, TidyAttr_LANGUAGE )

View file

@ -4,9 +4,6 @@
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
/*
config files associate a property name with a value.
// comments can start at the beginning of a line
@ -130,6 +127,7 @@ static const ctmbstr newlinePicks[] =
static const ctmbstr doctypePicks[] =
{
"html5",
"omit",
"auto",
"strict",
@ -200,7 +198,7 @@ static ParseProperty ParseSorter;
static ParseProperty ParseCharEnc;
static ParseProperty ParseNewline;
/* omit | auto | strict | loose | <fpi> */
/* html5 | omit | auto | strict | loose | <fpi> */
static ParseProperty ParseDocType;
/* keep-first or keep-last? */
@ -213,9 +211,9 @@ static const TidyOptionImpl option_defs[] =
{ TidyIndentSpaces, PP, "indent-spaces", IN, 2, ParseInt, NULL },
{ TidyWrapLen, PP, "wrap", IN, 68, ParseInt, NULL },
{ TidyTabSize, PP, "tab-size", IN, 8, ParseInt, NULL },
{ TidyCharEncoding, CE, "char-encoding", IN, ASCII, ParseCharEnc, charEncPicks },
{ TidyInCharEncoding, CE, "input-encoding", IN, LATIN1, ParseCharEnc, charEncPicks },
{ TidyOutCharEncoding, CE, "output-encoding", IN, ASCII, ParseCharEnc, charEncPicks },
{ TidyCharEncoding, CE, "char-encoding", IN, UTF8, ParseCharEnc, charEncPicks },
{ TidyInCharEncoding, CE, "input-encoding", IN, UTF8, ParseCharEnc, charEncPicks },
{ TidyOutCharEncoding, CE, "output-encoding", IN, UTF8, ParseCharEnc, charEncPicks },
{ TidyNewline, CE, "newline", IN, DLF, ParseNewline, newlinePicks },
{ TidyDoctypeMode, MU, "doctype-mode", IN, TidyDoctypeAuto, NULL, doctypePicks },
{ TidyDoctype, MU, "doctype", ST, 0, ParseDocType, doctypePicks },
@ -229,9 +227,12 @@ static const TidyOptionImpl option_defs[] =
{ TidyOutFile, MS, "output-file", ST, 0, ParseString, NULL },
{ TidyWriteBack, MS, "write-back", BL, no, ParseBool, boolPicks },
{ TidyShowMarkup, PP, "markup", BL, yes, ParseBool, boolPicks },
{ TidyShowInfo, DG, "show-info", BL, yes, ParseBool, boolPicks },
{ TidyShowWarnings, DG, "show-warnings", BL, yes, ParseBool, boolPicks },
{ TidyQuiet, MS, "quiet", BL, no, ParseBool, boolPicks },
{ TidyIndentContent, PP, "indent", IN, TidyNoState, ParseAutoBool, autoBoolPicks },
{ TidyCoerceEndTags, MU, "coerce-endtags", BL, yes, ParseBool, boolPicks },
{ TidyOmitOptionalTags, MU, "omit-optional-tags", BL, no, ParseBool, boolPicks },
{ TidyHideEndTags, MU, "hide-endtags", BL, no, ParseBool, boolPicks },
{ TidyXmlTags, MU, "input-xml", BL, no, ParseBool, boolPicks },
{ TidyXmlOut, MU, "output-xml", BL, no, ParseBool, boolPicks },
@ -242,9 +243,11 @@ static const TidyOptionImpl option_defs[] =
{ TidyUpperCaseAttrs, MU, "uppercase-attributes", BL, no, ParseBool, boolPicks },
{ TidyMakeBare, MU, "bare", BL, no, ParseBool, boolPicks },
{ TidyMakeClean, MU, "clean", BL, no, ParseBool, boolPicks },
{ TidyGDocClean, MU, "gdoc", BL, no, ParseBool, boolPicks },
{ TidyLogicalEmphasis, MU, "logical-emphasis", BL, no, ParseBool, boolPicks },
{ TidyDropPropAttrs, MU, "drop-proprietary-attributes", BL, no, ParseBool, boolPicks },
{ TidyDropFontTags, MU, "drop-font-tags", BL, no, ParseBool, boolPicks },
{ TidyDropEmptyElems, MU, "drop-empty-elements", BL, yes, ParseBool, boolPicks },
{ TidyDropEmptyParas, MU, "drop-empty-paras", BL, yes, ParseBool, boolPicks },
{ TidyFixComments, MU, "fix-bad-comments", BL, yes, ParseBool, boolPicks },
{ TidyBreakBeforeBR, PP, "break-before-br", BL, no, ParseBool, boolPicks },
@ -303,6 +306,7 @@ static const TidyOptionImpl option_defs[] =
#if SUPPORT_ASIAN_ENCODINGS
{ TidyPunctWrap, PP, "punctuation-wrap", BL, no, ParseBool, boolPicks },
#endif
{ TidyMergeEmphasis, MU, "merge-emphasis", BL, yes, ParseBool, boolPicks },
{ TidyMergeDivs, MU, "merge-divs", IN, TidyAutoState, ParseAutoBool, autoBoolPicks },
{ TidyDecorateInferredUL, MU, "decorate-inferred-ul", BL, no, ParseBool, boolPicks },
{ TidyPreserveEntities, MU, "preserve-entities", BL, no, ParseBool, boolPicks },
@ -1425,7 +1429,7 @@ ctmbstr TY_(CharEncodingOptName)( int encoding )
}
/*
doctype: omit | auto | strict | loose | <fpi>
doctype: html5 | omit | auto | strict | loose | <fpi>
where the fpi is a string similar to
@ -1462,6 +1466,8 @@ Bool ParseDocType( TidyDocImpl* doc, const TidyOptionImpl* option )
if ( TY_(tmbstrcasecmp)(buf, "auto") == 0 )
dtmode = TidyDoctypeAuto;
else if ( TY_(tmbstrcasecmp)(buf, "html5") == 0 )
dtmode = TidyDoctypeHtml5;
else if ( TY_(tmbstrcasecmp)(buf, "omit") == 0 )
dtmode = TidyDoctypeOmit;
else if ( TY_(tmbstrcasecmp)(buf, "strict") == 0 )

View file

@ -2,7 +2,7 @@
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
/*
@ -39,6 +39,13 @@
#include "clean.h"
#include "utf8.h"
#include "streamio.h"
#ifdef _MSC_VER
#include "sprtf.h"
#endif
#ifndef SPRTF
#define SPRTF printf
#endif
/* Forward references
*/
@ -113,6 +120,9 @@ int TY_(HTMLVersion)(TidyDocImpl* doc)
!cfgBool(doc, TidyHtmlOut);
Bool html4 = dtmode == TidyDoctypeStrict || dtmode == TidyDoctypeLoose || VERS_FROM40 & dtver;
if (xhtml && dtver == VERS_UNKNOWN) return XH50;
if (dtver == VERS_UNKNOWN) return HT50;
for (i = 0; W3C_Doctypes[i].name; ++i)
{
if ((xhtml && !(VERS_XHTML & W3C_Doctypes[i].vers)) ||
@ -171,7 +181,7 @@ static uint GetVersFromFPI(ctmbstr fpi)
uint i;
for (i = 0; W3C_Doctypes[i].name; ++i)
if (TY_(tmbstrcasecmp)(W3C_Doctypes[i].fpi, fpi) == 0)
if (W3C_Doctypes[i].fpi != NULL && TY_(tmbstrcasecmp)(W3C_Doctypes[i].fpi, fpi) == 0)
return W3C_Doctypes[i].vers;
return 0;
@ -224,6 +234,11 @@ Bool TY_(IsLetter)(uint c)
return (map & letter)!=0;
}
/* Returns yes when c is one of space (0x20), tab (0x09), line feed (0x0a),
   form feed (0x0c) or carriage return (0x0d); returns no for any other
   code point. */
Bool TY_(IsHTMLSpace)(uint c)
{
    switch (c)
    {
    case 0x009: /* tab */
    case 0x00a: /* line feed */
    case 0x00c: /* form feed */
    case 0x00d: /* carriage return */
    case 0x020: /* space */
        return yes;
    default:
        return no;
    }
}
Bool TY_(IsNamechar)(uint c)
{
uint map = MAP(c);
@ -1393,10 +1408,10 @@ Bool TY_(AddGenerator)( TidyDocImpl* doc )
if (head)
{
#ifdef PLATFORM_NAME
TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for "PLATFORM_NAME" (vers %s), see www.w3.org",
TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for HTML5 (experimental) for "PLATFORM_NAME" %s",
tidyReleaseDate());
#else
TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy (vers %s), see www.w3.org", tidyReleaseDate());
TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for HTML5 (experimental) %s", tidyReleaseDate());
#endif
for ( node = head->content; node; node = node->next )
@ -1562,6 +1577,12 @@ Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc )
switch(dtmode)
{
case TidyDoctypeHtml5:
/* HTML5 */
TY_(RepairAttrValue)(doc, doctype, pub, NULL);
TY_(RepairAttrValue)(doc, doctype, sys, NULL);
lexer->versionEmitted = XH50;
break;
case TidyDoctypeStrict:
/* XHTML 1.0 Strict */
TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S));
@ -1580,7 +1601,11 @@ Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc )
TY_(RepairAttrValue)(doc, doctype, sys, "");
break;
case TidyDoctypeAuto:
if (lexer->versions & XH11 && lexer->doctype == XH11)
if (lexer->doctype == VERS_UNKNOWN) {
lexer->versionEmitted = XH50;
return yes;
}
else if (lexer->versions & XH11 && lexer->doctype == XH11)
{
if (!TY_(GetAttrByName)(doctype, sys))
TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11));
@ -1618,10 +1643,6 @@ Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc )
TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T));
lexer->versionEmitted = X10T;
}
else if (lexer->versions & XH50)
{
lexer->versionEmitted = XH50;
}
else
{
if (doctype)
@ -1678,6 +1699,9 @@ Bool TY_(FixDocType)( TidyDocImpl* doc )
switch (dtmode)
{
case TidyDoctypeHtml5:
guessed = HT50;
break;
case TidyDoctypeStrict:
guessed = H41S;
break;
@ -2010,6 +2034,7 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode );
Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode )
{
Node *node;
Lexer* lexer = doc->lexer;
if (lexer->pushed || lexer->itoken)
@ -2030,33 +2055,61 @@ Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode )
/* duplicate inlines in preference to pushed text nodes when appropriate */
lexer->pushed = no;
if (lexer->token->type != TextNode
|| !(lexer->insert || lexer->inode))
return lexer->token;
return lexer->itoken = TY_(InsertedToken)( doc );
|| !(lexer->insert || lexer->inode)) {
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning pushed token '%s'...\n", node->element ? node->element : "<blank>");
#endif
return node;
}
lexer->itoken = TY_(InsertedToken)( doc );
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning inserted token '%s'...\n", node->element ? node->element : "<blank>");
#endif
return node;
}
assert( !(lexer->pushed || lexer->itoken) );
/* at start of block elements, unclosed inline
elements are inserted into the token stream */
if (lexer->insert || lexer->inode)
return lexer->token = TY_(InsertedToken)( doc );
if (lexer->insert || lexer->inode) {
lexer->token = TY_(InsertedToken)( doc );
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning Inserted token '%s'...\n", node->element ? node->element : "<blank>");
#endif
return node;
}
if (mode == CdataContent)
{
assert( lexer->parent != NULL );
return GetCDATA(doc, lexer->parent);
node = GetCDATA(doc, lexer->parent);
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning Cdatacontent token '%s'...\n", node->element ? node->element : "<blank>");
#endif
return node;
}
return GetTokenFromStream( doc, mode );
}
#if !defined(NDEBUG) && defined(_MSC_VER)
/* Debug-build trace helper: writes "Have node <name>" through SPRTF.
   name is only read, so it is const-qualified; a NULL name is reported
   as "<blank>" (the same fallback the lexer trace calls use) instead of
   being passed to %s. */
static void check_me(const char *name)
{
    SPRTF("Have node %s\n", name ? name : "<blank>");
}
#endif
static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
{
Lexer* lexer = doc->lexer;
uint c, badcomment = 0;
Bool isempty = no;
AttVal *attributes = NULL;
Node *node;
/* Lexer->token must be set on return. Nullify it for safety. */
lexer->token = NULL;
@ -2170,7 +2223,11 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
#ifdef TIDY_STORE_ORIGINAL_TEXT
StoreOriginalTextInToken(doc, lexer->token, 3);
#endif
return lexer->token;
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning text token len %d...\n", node->end - node->start );
#endif
return node;
}
continue; /* no text so keep going */
@ -2397,7 +2454,11 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
#ifdef TIDY_STORE_ORIGINAL_TEXT
StoreOriginalTextInToken(doc, lexer->token, 0); /* hmm... */
#endif
return lexer->token; /* the endtag token */
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning endtag token '%s'...\n", node->element ? node->element : "<blank>");
#endif
return node; /* the endtag token */
case LEX_STARTTAG: /* first letter of tagname */
c = TY_(ReadChar)(doc->docIn);
@ -2471,7 +2532,19 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
#ifdef TIDY_STORE_ORIGINAL_TEXT
StoreOriginalTextInToken(doc, lexer->token, 0);
#endif
return lexer->token; /* return start tag */
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning starttag token '%s'...\n", node->element ? node->element : "<blank>");
if (node->element) {
//if (stricmp(node->element,"datalist") == 0) {
// check_me(node->element);
//} else
if (stricmp(node->element,"option") == 0) {
check_me(node->element);
}
}
#endif
return node; /* return start tag */
case LEX_COMMENT: /* seen <!-- so look for --> */
@ -2509,7 +2582,11 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
else
TY_(UngetChar)(c, doc->docIn);
return lexer->token;
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning comment token '%s'...\n", node->element ? node->element : "<blank>");
#endif
return node;
}
/* note position of first such error in the comment */
@ -2554,7 +2631,11 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
/* make a note of the version named by the 1st doctype */
if (lexer->doctype == VERS_UNKNOWN && lexer->token && !cfgBool(doc, TidyXmlTags))
lexer->doctype = FindGivenVersion(doc, lexer->token);
return lexer->token;
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning doctype token '%s'...\n", node->element ? node->element : "<blank>");
#endif
return node;
case LEX_PROCINSTR: /* seen <? so look for '>' */
/* check for PHP preprocessor instructions <?php ... ?> */
@ -2636,7 +2717,11 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
lexer->state = LEX_CONTENT;
lexer->waswhite = no;
return lexer->token;
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning procinstr token '%s'...\n", node->element ? node->element : "<blank>");
#endif
return node;
case LEX_ASP: /* seen <% so look for "%>" */
if (c != '%')
@ -2657,7 +2742,14 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
lexer->lexbuf[lexer->lexsize] = '\0';
lexer->state = LEX_CONTENT;
lexer->waswhite = no;
return lexer->token = AspToken(doc);
lexer->token = AspToken(doc);
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning ASP token '%s'...\n", node->element ? node->element : "<blank>");
#endif
return node; /* the endtag token */
case LEX_JSTE: /* seen <# so look for "#>" */
if (c != '#')
@ -2678,7 +2770,13 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
lexer->lexbuf[lexer->lexsize] = '\0';
lexer->state = LEX_CONTENT;
lexer->waswhite = no;
return lexer->token = JsteToken(doc);
lexer->token = JsteToken(doc);
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning JSTE token '%s'...\n", node->element ? node->element : "<blank>");
#endif
return node; /* the JSTE token */
case LEX_PHP: /* seen "<?php" so look for "?>" */
if (c != '?')
@ -2698,7 +2796,12 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
lexer->lexbuf[lexer->lexsize] = '\0';
lexer->state = LEX_CONTENT;
lexer->waswhite = no;
return lexer->token = PhpToken(doc);
lexer->token = PhpToken(doc);
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning PHP token '%s'...\n", node->element ? node->element : "<blank>");
#endif
return node; /* the PHP token */
case LEX_XMLDECL: /* seen "<?xml" so look for "?>" */
@ -2728,7 +2831,11 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
lexer->waswhite = no;
lexer->token = XmlDeclToken(doc);
lexer->token->attributes = attributes;
return lexer->token;
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning xml token '%s'...\n", node->element ? node->element : "<blank>");
#endif
return node; /* the xml token */
}
av = TY_(NewAttribute)(doc);
@ -2756,7 +2863,11 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
lexer->waswhite = no;
lexer->token = XmlDeclToken(doc);
lexer->token->attributes = attributes;
return lexer->token;
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning XML token '%s'...\n", node->element ? node->element : "<blank>");
#endif
return node; /* the XML token */
case LEX_SECTION: /* seen "<![" so look for "]>" */
if (c == '[')
@ -2787,7 +2898,12 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
lexer->lexbuf[lexer->lexsize] = '\0';
lexer->state = LEX_CONTENT;
lexer->waswhite = no;
return lexer->token = SectionToken(doc);
lexer->token = SectionToken(doc);
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning SECTION token '%s'...\n", node->element ? node->element : "<blank>");
#endif
return node; /* the SECTION token */
case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
if (c != ']')
@ -2817,7 +2933,12 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
lexer->lexbuf[lexer->lexsize] = '\0';
lexer->state = LEX_CONTENT;
lexer->waswhite = no;
return lexer->token = CDATAToken(doc);
lexer->token = CDATAToken(doc);
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning CDATA token '%s'...\n", node->element ? node->element : "<blank>");
#endif
return node; /* the CDATA token */
}
}
@ -2838,7 +2959,11 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
#ifdef TIDY_STORE_ORIGINAL_TEXT
StoreOriginalTextInToken(doc, lexer->token, 0); /* ? */
#endif
return lexer->token;
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning textstring token '%s'...\n", node->element ? node->element : "<blank>");
#endif
return node; /* the textstring token */
}
}
else if (lexer->state == LEX_COMMENT) /* comment */
@ -2850,9 +2975,17 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
lexer->lexbuf[lexer->lexsize] = '\0';
lexer->state = LEX_CONTENT;
lexer->waswhite = no;
return lexer->token = CommentToken(doc);
lexer->token = CommentToken(doc);
node = lexer->token;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning COMMENT token '%s'...\n", node->element ? node->element : "<blank>");
#endif
return node; /* the COMMENT token */
}
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Returning NULL...\n");
#endif
return NULL;
}

View file

@ -5,10 +5,7 @@
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
/*
Given an input source, it returns a sequence of tokens.
GetToken(source) gets the next token
@ -189,7 +186,7 @@ typedef enum
/* special flag */
#define VERS_XML 65536u
/* "HTML5" */
/* HTML5 */
#define HT50 131072u
#define XH50 262144u
@ -202,6 +199,8 @@ typedef enum
#define VERS_FRAMESET (H40F|H41F|X10F)
#define VERS_XHTML11 (XH11)
#define VERS_BASIC (XB10)
/* HTML5 */
#define VERS_HTML5 (HT50|XH50)
/* meta symbols */
#define VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)
@ -411,6 +410,7 @@ void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers );
Bool TY_(IsWhite)(uint c);
Bool TY_(IsDigit)(uint c);
Bool TY_(IsLetter)(uint c);
Bool TY_(IsHTMLSpace)(uint c);
Bool TY_(IsNewline)(uint c);
Bool TY_(IsNamechar)(uint c);
Bool TY_(IsXMLLetter)(uint c);

View file

@ -6,7 +6,7 @@
You should only need to edit this file and tidy.c
to localize HTML tidy. *** This needs checking ***
*/
#include "tidy-int.h"
@ -101,6 +101,8 @@ static struct _msgfmt
{ NESTED_QUOTATION, "nested q elements, possible typo." }, /* Warning */
{ OBSOLETE_ELEMENT, "replacing obsolete element %s by %s" }, /* Warning */
{ COERCE_TO_ENDTAG_WARN, "<%s> is probably intended as </%s>" }, /* Warning */
/* HTML5 */
{ REMOVED_HTML5, "%s element removed from HTML5" }, /* Warning */
/* ReportNotice */
{ TRIM_EMPTY_ELEMENT, "trimming empty %s" }, /* Notice */
@ -320,7 +322,7 @@ static const TidyOptionId TidyIndentContentLinks[] =
static const TidyOptionId TidyIndentSpacesLinks[] =
{ TidyIndentContent, TidyUnknownOption };
static const TidyOptionId TidyWrapAttValsLinks[] =
{ TidyWrapScriptlets, TidyUnknownOption };
{ TidyWrapScriptlets, TidyLiteralAttribs, TidyUnknownOption };
static const TidyOptionId TidyWrapScriptletsLinks[] =
{ TidyWrapAttVals, TidyUnknownOption };
static const TidyOptionId TidyCharEncodingLinks[] =
@ -353,6 +355,8 @@ static const TidyOptionId TidyDropFontTagsLinks[] =
{ TidyMakeClean, TidyUnknownOption };
static const TidyOptionId TidyMakeCleanTagsLinks[] =
{ TidyDropFontTags, TidyUnknownOption };
static const TidyOptionId TidyGDocCleanLinks[] =
{ TidyMakeClean, TidyUnknownOption };
/* Documentation of options */
static const TidyOptionDoc option_docs[] =
@ -399,14 +403,24 @@ static const TidyOptionDoc option_docs[] =
"on the HTML saved by Microsoft Office products. "
, TidyMakeCleanTagsLinks
},
{TidyGDocClean,
"This option specifies if Tidy "
"should enable specific behavior for cleaning up HTML exported from "
"Google Docs. "
, TidyMakeCleanTagsLinks
},
{TidyDoctype,
"This option specifies the DOCTYPE declaration generated by Tidy. If set "
"to \"omit\" the output won't contain a DOCTYPE declaration. If set to "
"\"auto\" (the default) Tidy will use an educated guess based upon the "
"contents of the document. If set to \"strict\", Tidy will set the DOCTYPE "
"to the strict DTD. If set to \"loose\", the DOCTYPE is set to the loose "
"(transitional) DTD. Alternatively, you can supply a string for the formal "
"public identifier (FPI).<br />"
"This option specifies the DOCTYPE declaration generated by Tidy.<br />"
"If set to \"omit\" the output won't contain a DOCTYPE declaration.<br />"
"If set to \"html5\" the DOCTYPE is set to \"&lt;!DOCTYPE html>\".<br />"
"If set to \"auto\" (the default) Tidy will use an educated guess based "
"upon the contents of the document.<br />"
"If set to \"strict\", Tidy will set the DOCTYPE to the HTML4 or XHTML1 "
"strict DTD.<br />"
"If set to \"loose\", the DOCTYPE is set to the HTML4 or XHTML1 loose "
"(transitional) DTD. <br />"
"Alternatively, you can supply a string for the formal public identifier "
"(FPI).<br />"
"<br />"
"For example: <br />"
"doctype: \"-//ACME//DTD HTML 3.14159//EN\"<br />"
@ -419,6 +433,9 @@ static const TidyOptionDoc option_docs[] =
"<code>--numeric-entities yes</code>. This option does not offer a "
"validation of the document conformance. "
},
{TidyDropEmptyElems,
"This option specifies if Tidy should discard empty elements. "
},
{TidyDropEmptyParas,
"This option specifies if Tidy should discard empty paragraphs. "
},
@ -460,10 +477,22 @@ static const TidyOptionDoc option_docs[] =
{TidyHideComments,
"This option specifies if Tidy should print out comments. "
},
{TidyCoerceEndTags,
"This option specifies if Tidy should coerce a start tag into an end tag "
"in cases where it looks like an end tag was probably intended; "
"for example, given &lt;span&gt;foo &lt;b&gt;bar&lt;b&gt; baz&lt;/span&gt;, "
"Tidy will output &lt;span&gt;foo &lt;b&gt;bar&lt;/b&gt; baz&lt;/span&gt;. "
},
{TidyOmitOptionalTags,
"This option specifies if Tidy should omit optional start tags and end tags "
"when generating output. Setting this option causes all tags for the "
"html, head, and body elements to be omitted from output, as well as such "
"end tags as &lt;/p&gt;, &lt;/li&gt;, &lt;/dt&gt;, &lt;/dd&gt;, "
"&lt;/option&gt;, &lt;/tr&gt;, &lt;/td&gt;, and &lt;/th&gt;. "
"This option is ignored for XML output. "
},
{TidyHideEndTags,
"This option specifies if Tidy should omit optional end-tags when "
"generating the pretty printed markup. This option is ignored if you are "
"outputting to XML. "
"This option is an alias for omit-optional-tags. "
},
{TidyIndentCdata,
"This option specifies if Tidy should indent &lt;![CDATA[]]&gt; sections. "
@ -494,6 +523,12 @@ static const TidyOptionDoc option_docs[] =
"that takes a list of predefined values to lower case. This is required "
"for XHTML documents. "
},
{TidyMergeEmphasis,
"This option specifies if Tidy should merge nested &lt;b&gt; and &lt;i&gt; "
"elements; for example, for the case "
"&lt;b class=\"rtop-2\"&gt;foo &lt;b class=\"r2-2\"&gt;bar&lt;/b&gt; baz&lt;/b&gt;, "
"Tidy will output &lt;b class=\"rtop-2\"&gt;foo bar baz&lt;/b&gt;. "
},
{TidyMergeDivs,
"Can be used to modify behavior of -c (--clean yes) option. "
"This option specifies if Tidy should merge nested &lt;div&gt; such as "
@ -644,6 +679,9 @@ static const TidyOptionDoc option_docs[] =
"This option specifies the number Tidy uses to determine if further errors "
"should be shown. If set to 0, then no errors are shown. "
},
{TidyShowInfo,
"This option specifies if Tidy should display info-level messages. "
},
{TidyShowWarnings,
"This option specifies if Tidy should suppress warnings. This can be "
"useful when a few errors are hidden in a flurry of warnings. "
@ -670,8 +708,14 @@ static const TidyOptionDoc option_docs[] =
,TidyIndentSpacesLinks
},
{TidyLiteralAttribs,
"This option specifies if Tidy should ensure that whitespace characters "
"within attribute values are passed through unchanged. "
"This option specifies how Tidy deals with whitespace characters within "
"attribute values. If the value is \"no\" (the default), Tidy \"munges\" "
"or \"normalizes\" attribute values by replacing any newline or tab "
"character with a single space character, and further by replacing "
"any sequences of multiple whitespace characters with a single space. "
"To force tidy to preserve the original, literal values of all attributes, "
"and ensure that whitespace characters within attribute values are passed "
"through unchanged, set this option to \"yes\". "
},
{TidyShowMarkup,
"This option specifies if Tidy should generate a pretty printed version "
@ -706,9 +750,18 @@ static const TidyOptionDoc option_docs[] =
"pseudo elements, which look like: &lt;% ... %&gt;. "
},
{TidyWrapAttVals,
"This option specifies if Tidy should line wrap attribute values, for "
"easier editing. This option can be set independently of "
"wrap-script-literals. "
"This option specifies if Tidy should line-wrap attribute values, for "
"easier editing. Line wrapping means that if the value of an attribute "
"causes a line to exceed the width specified by the \"wrap\" option, "
"tidy will add one or more line breaks to the value, causing it to "
"wrapped into multiple lines. Note that this option can be set "
"independently of wrap-script-literals. Also note that by default, Tidy "
"\"munges\" or \"normalizes\" attribute values by replacing any newline "
"or tab character with a single space character, and further by replacing "
"any sequences of multiple whitespace characters with a single space. "
"To force Tidy to preserve the original, literal values of all attributes, "
"and ensure that whitespace characters within attribute values are passed "
"through unchanged, set the literal-attributes option to \"yes\". "
,TidyWrapAttValsLinks
},
{TidyWrapJste,
@ -1047,6 +1100,7 @@ __attribute__((format(printf, 2, 3)))
void message( TidyDocImpl* doc, TidyReportLevel level, ctmbstr msg, ... )
{
va_list args;
if (level == TidyInfo && !cfgBool(doc, TidyShowInfo)) return;
va_start( args, msg );
messagePos( doc, level, 0, 0, msg, args );
va_end( args );
@ -1367,14 +1421,14 @@ void TY_(ReportAccessWarning)( TidyDocImpl* doc, Node* node, uint code )
{
ctmbstr fmt = GetFormatFromCode(code);
doc->badAccess |= BA_WAI;
messageNode( doc, TidyAccess, node, fmt );
messageNode( doc, TidyAccess, node, "%s", fmt );
}
void TY_(ReportAccessError)( TidyDocImpl* doc, Node* node, uint code )
{
ctmbstr fmt = GetFormatFromCode(code);
doc->badAccess |= BA_WAI;
messageNode( doc, TidyAccess, node, fmt );
messageNode( doc, TidyAccess, node, "%s", fmt );
}
#endif /* SUPPORT_ACCESSIBILITY_CHECKS */
@ -1393,7 +1447,7 @@ void TY_(ReportWarning)(TidyDocImpl* doc, Node *element, Node *node, uint code)
switch (code)
{
case NESTED_QUOTATION:
messageNode(doc, TidyWarning, rpt, fmt);
messageNode(doc, TidyWarning, rpt, "%s", fmt);
break;
case OBSOLETE_ELEMENT:
@ -1401,6 +1455,7 @@ void TY_(ReportWarning)(TidyDocImpl* doc, Node *element, Node *node, uint code)
messageNode(doc, TidyWarning, rpt, fmt, elemdesc, nodedesc);
break;
case REMOVED_HTML5:
case NESTED_EMPHASIS:
messageNode(doc, TidyWarning, rpt, fmt, nodedesc);
break;
@ -1474,7 +1529,7 @@ void TY_(ReportError)(TidyDocImpl* doc, Node *element, Node *node, uint code)
case INCONSISTENT_NAMESPACE:
case DOCTYPE_AFTER_TAGS:
case DTYPE_NOT_UPPER_CASE:
messageNode(doc, TidyWarning, rpt, fmt);
messageNode(doc, TidyWarning, rpt, "%s", fmt);
break;
case COERCE_TO_ENDTAG:
@ -1493,7 +1548,7 @@ void TY_(ReportError)(TidyDocImpl* doc, Node *element, Node *node, uint code)
case ENCODING_IO_CONFLICT:
case MISSING_DOCTYPE:
case SPACE_PRECEDING_XMLDECL:
messageNode(doc, TidyWarning, node, fmt);
messageNode(doc, TidyWarning, node, "%s", fmt);
break;
case TRIM_EMPTY_ELEMENT:
@ -1542,7 +1597,7 @@ void TY_(ReportFatal)( TidyDocImpl* doc, Node *element, Node *node, uint code)
{
case SUSPECTED_MISSING_QUOTE:
case DUPLICATE_FRAMESET:
messageNode(doc, TidyError, rpt, fmt);
messageNode(doc, TidyError, rpt, "%s", fmt);
break;
case UNKNOWN_ELEMENT:
@ -1775,11 +1830,14 @@ void TY_(NeedsAuthorIntervention)( TidyDocImpl* doc )
void TY_(GeneralInfo)( TidyDocImpl* doc )
{
tidy_out(doc, "To learn more about HTML Tidy see http://tidy.sourceforge.net\n");
tidy_out(doc, "Please fill bug reports and queries using the \"tracker\" on the Tidy web site.\n");
tidy_out(doc, "Additionally, questions can be sent to html-tidy@w3.org\n");
tidy_out(doc, "HTML and CSS specifications are available from http://www.w3.org/\n");
tidy_out(doc, "Lobby your company to join W3C, see http://www.w3.org/Consortium\n");
if (!cfgBool(doc, TidyShowInfo)) return;
tidy_out(doc, "About this fork of Tidy: http://w3c.github.com/tidy-html5/\n");
tidy_out(doc, "Bug reports and comments: https://github.com/w3c/tidy-html5/issues/\n");
tidy_out(doc, "Or send questions and comments to html-tidy@w3.org\n");
tidy_out(doc, "Latest HTML specification: http://dev.w3.org/html5/spec-author-view/\n");
tidy_out(doc, "HTML language reference: http://dev.w3.org/html5/markup/\n");
tidy_out(doc, "Validate your HTML5 documents: http://validator.w3.org/nu/\n");
tidy_out(doc, "Lobby your company to join the W3C: http://www.w3.org/Consortium\n");
}
#if SUPPORT_ACCESSIBILITY_CHECKS

View file

@ -5,7 +5,7 @@
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "forward.h"
@ -154,7 +154,9 @@ void TY_(ReportFatal)(TidyDocImpl* doc, Node* element, Node* node, uint code);
#define MISSING_ATTRIBUTE 86
#define WHITE_IN_URI 87
#define PREVIOUS_LOCATION 88 /* last */
#define REMOVED_HTML5 88 /* this element removed from HTML5 */
#define PREVIOUS_LOCATION 89 /* last */
/* character encoding errors */

View file

@ -2,7 +2,7 @@
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "tidy-int.h"
@ -12,6 +12,13 @@
#include "clean.h"
#include "tags.h"
#include "tmbstr.h"
#ifdef _MSC_VER
#include "sprtf.h"
#endif
#ifndef SPRTF
#define SPRTF printf
#endif
#ifdef AUTO_INPUT_ENCODING
#include "charsets.h"
@ -234,6 +241,9 @@ void TY_(InsertNodeAfterElement)(Node *element, Node *node)
static Bool CanPrune( TidyDocImpl* doc, Node *element )
{
if ( !cfgBool(doc, TidyDropEmptyElems) )
return no;
if ( TY_(nodeIsText)(element) )
return yes;
@ -278,6 +288,13 @@ static Bool CanPrune( TidyDocImpl* doc, Node *element )
if (nodeIsTEXTAREA(element))
return no;
/* fix for ISSUE #7 https://github.com/w3c/tidy-html5/issues/7 */
if (nodeIsCANVAS(element))
return no;
if (nodeIsPROGRESS(element))
return no;
if ( attrGetID(element) || attrGetNAME(element) )
return no;
@ -296,6 +313,10 @@ static Bool CanPrune( TidyDocImpl* doc, Node *element )
if (nodeIsCOLGROUP(element))
return no;
/* HTML5 - do NOT drop empty option if it has attributes */
if ( nodeIsOPTION(element) && element->attributes != NULL )
return no;
return yes;
}
@ -811,13 +832,25 @@ static void AddClassNoIndent( TidyDocImpl* doc, Node *node )
*/
void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
{
#if !defined(NDEBUG) && defined(_MSC_VER)
static int in_parse_block = 0;
#endif
Lexer* lexer = doc->lexer;
Node *node;
Bool checkstack = yes;
uint istackbase = 0;
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_block++;
SPRTF("Entering ParseBlock %d...\n",in_parse_block);
#endif
if ( element->tag->model & CM_EMPTY )
if ( element->tag->model & CM_EMPTY ) {
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_block--;
SPRTF("Exit ParseBlockL 1 %d...\n",in_parse_block);
#endif
return;
}
if ( nodeIsFORM(element) &&
DescendantOf(element, TidyTag_FORM) )
@ -860,6 +893,10 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
element->closed = yes;
TrimSpaces( doc, element );
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_block--;
SPRTF("Exit ParseBlock 2 %d...\n",in_parse_block);
#endif
return;
}
@ -951,6 +988,10 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
{
TY_(UngetToken)( doc );
TrimSpaces( doc, element );
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_block--;
SPRTF("Exit ParseBlock 2 %d...\n",in_parse_block);
#endif
return;
}
}
@ -1111,6 +1152,10 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
{
TY_(UngetToken)( doc );
TrimSpaces( doc, element );
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_block--;
SPRTF("Exit ParseBlock 3 %d...\n",in_parse_block);
#endif
return;
}
}
@ -1127,6 +1172,10 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
lexer->istackbase = istackbase;
TrimSpaces( doc, element );
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_block--;
SPRTF("Exit ParseBlock 4 %d...\n",in_parse_block);
#endif
return;
}
}
@ -1177,6 +1226,10 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
element->parent->tag->parser == TY_(ParseList) )
{
TrimSpaces( doc, element );
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_block--;
SPRTF("Exit ParseBlock 5 %d...\n",in_parse_block);
#endif
return;
}
@ -1188,6 +1241,10 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
if ( nodeIsDL(element->parent) )
{
TrimSpaces( doc, element );
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_block--;
SPRTF("Exit ParseBlock 6 %d...\n",in_parse_block);
#endif
return;
}
@ -1198,8 +1255,13 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
/* http://tidy.sf.net/issue/1316307 */
/* In exiled mode, return so table processing can
continue. */
if (lexer->exiled)
if (lexer->exiled) {
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_block--;
SPRTF("Exit ParseBlock 7 %d...\n",in_parse_block);
#endif
return;
}
node = TY_(InferredTag)(doc, TidyTag_TABLE);
}
else if ( TY_(nodeHasCM)(element, CM_OBJECT) )
@ -1209,12 +1271,20 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
TY_(PopInline)( doc, NULL );
lexer->istackbase = istackbase;
TrimSpaces( doc, element );
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_block--;
SPRTF("Exit ParseBlock 8 %d...\n",in_parse_block);
#endif
return;
}
else
{
TrimSpaces( doc, element );
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_block--;
SPRTF("Exit ParseBlock 9 %d...\n",in_parse_block);
#endif
return;
}
}
@ -1278,15 +1348,31 @@ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
}
TrimSpaces( doc, element );
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_block--;
SPRTF("Exit ParseBlock 10 %d...\n",in_parse_block);
#endif
}
void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
{
#if !defined(NDEBUG) && defined(_MSC_VER)
static int in_parse_inline = 0;
#endif
Lexer* lexer = doc->lexer;
Node *node, *parent;
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_inline++;
SPRTF("Entering ParseInline %d...\n",in_parse_inline);
#endif
if (element->tag->model & CM_EMPTY)
if (element->tag->model & CM_EMPTY) {
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_inline--;
SPRTF("Exit ParseInline 1 %d...\n",in_parse_inline);
#endif
return;
}
/*
ParseInline is used for some block level elements like H1 to H6
@ -1363,6 +1449,10 @@ void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
element->closed = yes;
TrimSpaces( doc, element );
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_inline--;
SPRTF("Exit ParseInline 2 %d...\n",in_parse_inline);
#endif
return;
}
@ -1384,6 +1474,7 @@ void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
&& !nodeIsSUP(node)
&& !nodeIsQ(node)
&& !nodeIsSPAN(node)
&& cfgBool(doc, TidyCoerceEndTags)
)
{
/* proceeds only if "node" does not have any attribute and
@ -1442,7 +1533,10 @@ void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
if (!(mode & Preformatted))
TrimSpaces(doc, element);
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_inline--;
SPRTF("Exit ParseInline 3 %d...\n",in_parse_inline);
#endif
return;
}
@ -1529,6 +1623,10 @@ void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
TY_(InlineDup1)( doc, NULL, element ); /* dupe the <i>, after </b> */
if (!(mode & Preformatted))
TrimSpaces( doc, element );
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_inline--;
SPRTF("Exit ParseInline 4 %d...\n",in_parse_inline);
#endif
return; /* close <i>, but will re-open it, after </b> */
}
}
@ -1549,7 +1647,10 @@ void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
if (!(mode & Preformatted))
TrimSpaces(doc, element);
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_inline--;
SPRTF("Exit ParseInline 5 %d...\n",in_parse_inline);
#endif
return;
}
@ -1563,6 +1664,10 @@ void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
{
TY_(UngetToken)( doc );
TrimSpaces(doc, element);
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_inline--;
SPRTF("Exit ParseInline 6 %d...\n",in_parse_inline);
#endif
return;
}
}
@ -1585,6 +1690,10 @@ void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
if (!(mode & Preformatted))
TrimSpaces(doc, element);
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_inline--;
SPRTF("Exit ParseInline 7 %d...\n",in_parse_inline);
#endif
return;
}
@ -1601,7 +1710,8 @@ void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
/* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
/* other fixes by Dave Raggett */
/* if (node->attributes == NULL) */
if (node->type != EndTag && node->attributes == NULL)
if (node->type != EndTag && node->attributes == NULL
&& cfgBool(doc, TidyCoerceEndTags) )
{
node->type = EndTag;
TY_(ReportError)(doc, element, node, COERCE_TO_ENDTAG);
@ -1617,6 +1727,10 @@ void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
if (!(mode & Preformatted))
TrimSpaces(doc, element);
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_inline--;
SPRTF("Exit ParseInline 8 %d...\n",in_parse_inline);
#endif
return;
}
@ -1739,6 +1853,10 @@ void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
if (!(mode & Preformatted))
TrimSpaces(doc, element);
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_inline--;
SPRTF("Exit ParseInline 9 %d...\n",in_parse_inline);
#endif
return;
}
}
@ -1754,7 +1872,10 @@ void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
TY_(FreeNode)( doc, node);
continue;
}
/* HTML5 */
if (nodeIsDATALIST(element)) {
TY_(ConstrainVersion)( doc, ~VERS_HTML5 );
} else
if (!(element->tag->model & CM_OPT))
TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
@ -1776,6 +1897,10 @@ void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
{
TY_(DiscardElement)( doc, element );
TY_(UngetToken)( doc );
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_inline--;
SPRTF("Exit ParseInline 10 %d...\n",in_parse_inline);
#endif
return;
}
}
@ -1785,6 +1910,10 @@ void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
if (!(mode & Preformatted))
TrimSpaces(doc, element);
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_inline--;
SPRTF("Exit ParseInline 11 %d...\n",in_parse_inline);
#endif
return;
}
@ -1812,6 +1941,10 @@ void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
if (!(element->tag->model & CM_OPT))
TY_(ReportError)(doc, element, node, MISSING_ENDTAG_FOR);
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_inline--;
SPRTF("Exit ParseInline 12 %d...\n",in_parse_inline);
#endif
}
void TY_(ParseEmpty)(TidyDocImpl* doc, Node *element, GetTokenMode mode)
@ -1824,7 +1957,7 @@ void TY_(ParseEmpty)(TidyDocImpl* doc, Node *element, GetTokenMode mode)
{
if ( !(node->type == EndTag && node->tag == element->tag) )
{
TY_(ReportError)(doc, element, node, ELEMENT_NOT_EMPTY);
/* TY_(ReportError)(doc, element, node, ELEMENT_NOT_EMPTY); */
TY_(UngetToken)( doc );
}
else
@ -2895,10 +3028,17 @@ void TY_(ParseOptGroup)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(m
void TY_(ParseSelect)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode))
{
#if !defined(NDEBUG) && defined(_MSC_VER)
static int in_parse_select = 0;
#endif
Lexer* lexer = doc->lexer;
Node *node;
lexer->insert = NULL; /* defer implicit inline start tags */
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_select++;
SPRTF("Entering ParseSelect %d...\n",in_parse_select);
#endif
while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
{
@ -2907,6 +3047,10 @@ void TY_(ParseSelect)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mod
TY_(FreeNode)( doc, node);
field->closed = yes;
TrimSpaces(doc, field);
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_select--;
SPRTF("Exit ParseSelect 1 %d...\n",in_parse_select);
#endif
return;
}
@ -2917,6 +3061,7 @@ void TY_(ParseSelect)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mod
if ( node->type == StartTag &&
( nodeIsOPTION(node) ||
nodeIsOPTGROUP(node) ||
nodeIsDATALIST(node) ||
nodeIsSCRIPT(node))
)
{
@ -2931,8 +3076,72 @@ void TY_(ParseSelect)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mod
}
TY_(ReportError)(doc, field, node, MISSING_ENDTAG_FOR);
#if !defined(NDEBUG) && defined(_MSC_VER)
in_parse_select--;
SPRTF("Exit ParseSelect 2 %d...\n",in_parse_select);
#endif
}
/* HTML5
** Content parser for the element passed as `field` (presumably the
** <datalist> element -- confirm against the tag table in tags.c).
**
** Tokens are read with IgnoreWhitespace until the end tag matching
** `field` is seen, at which point `field` is marked closed and trimmed.
** Start tags for option, optgroup, datalist and script are appended to
** `field` and parsed recursively; comments and similar nodes are handed
** to InsertMisc; anything else is reported as DISCARDING_UNEXPECTED and
** freed.  If the token stream ends first, MISSING_ENDTAG_FOR is reported.
**
** The `mode` argument is unused.  The SPRTF trace lines are compiled in
** only for MSVC builds without NDEBUG.
*/
void TY_(ParseDatalist)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode))
{
#if !defined(NDEBUG) && defined(_MSC_VER)
    static int in_parse_datalist = 0;
#endif
    Node *token;
    Bool isAllowedChild;

    /* defer implicit inline start tags */
    doc->lexer->insert = NULL;

#if !defined(NDEBUG) && defined(_MSC_VER)
    in_parse_datalist++;
    SPRTF("Entering ParseDatalist %d...\n",in_parse_datalist);
#endif

    for (;;)
    {
        token = TY_(GetToken)(doc, IgnoreWhitespace);
        if (token == NULL)
            break;

        /* matching end tag: the element is complete */
        if (token->type == EndTag && token->tag == field->tag)
        {
            TY_(FreeNode)( doc, token);
            field->closed = yes;
            TrimSpaces(doc, field);
#if !defined(NDEBUG) && defined(_MSC_VER)
            in_parse_datalist--;
            SPRTF("Exit ParseDatalist 1 %d...\n",in_parse_datalist);
#endif
            return;
        }

        /* deal with comments etc. */
        if (InsertMisc(field, token))
            continue;

        isAllowedChild = no;
        if ( token->type == StartTag )
        {
            if ( nodeIsOPTION(token) ||
                 nodeIsOPTGROUP(token) ||
                 nodeIsDATALIST(token) ||
                 nodeIsSCRIPT(token) )
                isAllowedChild = yes;
        }

        if ( !isAllowedChild )
        {
            /* discard unexpected tags */
            TY_(ReportError)(doc, field, token, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, token);
            continue;
        }

        TY_(InsertNodeAtEnd)(field, token);
        ParseTag(doc, token, IgnoreWhitespace);
    }

    /* input ran out before the end tag; token is NULL at this point */
    TY_(ReportError)(doc, field, token, MISSING_ENDTAG_FOR);
#if !defined(NDEBUG) && defined(_MSC_VER)
    in_parse_datalist--;
    SPRTF("Exit ParseDatalist 2 %d...\n",in_parse_datalist);
#endif
}
void TY_(ParseText)(TidyDocImpl* doc, Node *field, GetTokenMode mode)
{
Lexer* lexer = doc->lexer;
@ -3006,7 +3215,8 @@ void TY_(ParseTitle)(TidyDocImpl* doc, Node *title, GetTokenMode ARG_UNUSED(mode
Node *node;
while ((node = TY_(GetToken)(doc, MixedContent)) != NULL)
{
if (node->tag == title->tag && node->type == StartTag)
if (node->tag == title->tag && node->type == StartTag
&& cfgBool(doc, TidyCoerceEndTags) )
{
TY_(ReportError)(doc, title, node, COERCE_TO_ENDTAG);
node->type = EndTag;
@ -3129,6 +3339,9 @@ void TY_(ParseHead)(TidyDocImpl* doc, Node *head, GetTokenMode ARG_UNUSED(mode))
int HasTitle = 0;
int HasBase = 0;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Enter ParseHead...\n");
#endif
while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
{
if (node->tag == head->tag && node->type == EndTag)
@ -3214,10 +3427,6 @@ void TY_(ParseHead)(TidyDocImpl* doc, Node *head, GetTokenMode ARG_UNUSED(mode))
head ?
TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS);
}
else if ( nodeIsNOSCRIPT(node) )
{
TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN);
}
#ifdef AUTO_INPUT_ENCODING
else if (nodeIsMETA(node))
@ -3271,6 +3480,9 @@ void TY_(ParseHead)(TidyDocImpl* doc, Node *head, GetTokenMode ARG_UNUSED(mode))
TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)( doc, node);
}
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Exit ParseHead 1...\n");
#endif
}
void TY_(ParseBody)(TidyDocImpl* doc, Node *body, GetTokenMode mode)
@ -3283,6 +3495,9 @@ void TY_(ParseBody)(TidyDocImpl* doc, Node *body, GetTokenMode mode)
checkstack = yes;
TY_(BumpObject)( doc, body->parent );
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Enter ParseBody...\n");
#endif
while ((node = TY_(GetToken)(doc, mode)) != NULL)
{
@ -3510,7 +3725,7 @@ void TY_(ParseBody)(TidyDocImpl* doc, Node *body, GetTokenMode mode)
if (TY_(nodeIsElement)(node))
{
if ( TY_(nodeHasCM)(node, CM_INLINE) && !TY_(nodeHasCM)(node, CM_MIXED) )
if ( TY_(nodeHasCM)(node, CM_INLINE) )
{
/* HTML4 strict doesn't allow inline content here */
/* but HTML2 does allow img elements as children of body */
@ -3547,6 +3762,9 @@ void TY_(ParseBody)(TidyDocImpl* doc, Node *body, GetTokenMode mode)
TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
TY_(FreeNode)( doc, node);
}
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Exit ParseBody 1...\n");
#endif
}
void TY_(ParseNoFrames)(TidyDocImpl* doc, Node *noframes, GetTokenMode mode)
@ -3735,6 +3953,9 @@ void TY_(ParseHTML)(TidyDocImpl* doc, Node *html, GetTokenMode mode)
Node *frameset = NULL;
Node *noframes = NULL;
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Entering ParseHTML...\n");
#endif
TY_(SetOptionBool)( doc, TidyXmlTags, no );
for (;;)
@ -3790,7 +4011,9 @@ void TY_(ParseHTML)(TidyDocImpl* doc, Node *html, GetTokenMode mode)
TY_(InsertNodeAtEnd)(html, node);
TY_(ParseBody)(doc, node, mode);
}
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Exit ParseHTML 1...\n");
#endif
return;
}
@ -3956,6 +4179,9 @@ void TY_(ParseHTML)(TidyDocImpl* doc, Node *html, GetTokenMode mode)
TY_(InsertNodeAtEnd)(html, node);
ParseTag(doc, node, mode);
#if !defined(NDEBUG) && defined(_MSC_VER)
SPRTF("Exit ParseHTML 2...\n");
#endif
}
static Bool nodeCMIsOnlyInline( Node* node )
@ -4048,7 +4274,9 @@ static void ReplaceObsoleteElements(TidyDocImpl* doc, Node* node)
{
next = node->next;
if (nodeIsDIR(node) || nodeIsMENU(node))
/* if (nodeIsDIR(node) || nodeIsMENU(node)) */
/* HTML5 - <menu ... > is no longer obsolete */
if (nodeIsDIR(node))
TY_(CoerceNode)(doc, node, TidyTag_UL, yes, yes);
if (nodeIsXMP(node) || nodeIsLISTING(node) ||

View file

@ -3,7 +3,7 @@
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include <stdio.h>
@ -1152,7 +1152,7 @@ static void PPrintAttribute( TidyDocImpl* doc, uint indent,
{
if ( TY_(IsScript)(doc, name) )
wrappable = cfgBool( doc, TidyWrapScriptlets );
else if (!(attrIsCONTENT(attr) || attrIsVALUE(attr) || attrIsALT(attr)) && wrapAttrs )
else if (!(attrIsCONTENT(attr) || attrIsVALUE(attr) || attrIsALT(attr) || attrIsTITLE(attr)) && wrapAttrs )
wrappable = yes;
}
@ -2083,7 +2083,8 @@ void TY_(PPrintTree)( TidyDocImpl* doc, uint mode, uint indent, Node *node )
{
Bool indcont = ( cfgAutoBool(doc, TidyIndentContent) != TidyNoState );
Bool indsmart = ( cfgAutoBool(doc, TidyIndentContent) == TidyAutoState );
Bool hideend = cfgBool( doc, TidyHideEndTags );
Bool hideend = cfgBool( doc, TidyHideEndTags ) ||
cfgBool( doc, TidyOmitOptionalTags );
Bool classic = cfgBool( doc, TidyVertSpace );
uint contentIndent = indent;

View file

@ -269,6 +269,11 @@ Bool TIDY_CALL tidyNodeIsMENU( TidyNode tnod )
{ return nodeIsMENU( tidyNodeToImpl(tnod) );
}
/* HTML5: yes when the node behind the opaque handle `tnod` carries the
** DATALIST tag id, as tested by the nodeIsDATALIST macro.
*/
Bool TIDY_CALL tidyNodeIsDATALIST( TidyNode tnod )
{
    Node* impl = tidyNodeToImpl( tnod );
    return nodeIsDATALIST( impl );
}
/*
* local variables:

1902
src/tags.c

File diff suppressed because it is too large Load diff

View file

@ -110,6 +110,7 @@ Parser TY_(ParseRow);
Parser TY_(ParseSelect);
Parser TY_(ParseOptGroup);
Parser TY_(ParseText);
Parser TY_(ParseDatalist);
CheckAttribs TY_(CheckAttributes);
@ -224,6 +225,11 @@ uint TY_(nodeHeaderLevel)( Node* node ); /* 1, 2, ..., 6 */
#define nodeIsU( node ) TagIsId( node, TidyTag_U )
#define nodeIsMENU( node ) TagIsId( node, TidyTag_MENU )
#define nodeIsBUTTON( node ) TagIsId( node, TidyTag_BUTTON )
#define nodeIsCANVAS( node ) TagIsId( node, TidyTag_CANVAS )
#define nodeIsPROGRESS( node ) TagIsId( node, TidyTag_PROGRESS )
/* HTML5 */
#define nodeIsDATALIST( node ) TagIsId( node, TidyTag_DATALIST )
#endif /* __TAGS_H__ */

View file

@ -4,14 +4,14 @@
See tidy.h for the copyright notice.
Defines HTML Tidy API implemented by tidy library.
Very rough initial cut for discussion purposes.
Public interface is const-correct and doesn't explicitly depend
on any globals. Thus, thread-safety may be introduced w/out
changing the interface.
Looking ahead to a C++ wrapper, C functions always pass
Looking ahead to a C++ wrapper, C functions always pass
this-equivalent as 1st arg.
Created 2001-05-20 by Charles Reitzel
@ -23,6 +23,7 @@
#include "tidy-int.h"
#include "parser.h"
#include "clean.h"
#include "gdoc.h"
#include "config.h"
#include "message.h"
#include "pprint.h"
@ -111,7 +112,7 @@ TidyOption tidyImplToOption( const TidyOptionImpl* option )
** 0 -> SUCCESS
** >0 -> WARNING
** <0 -> ERROR
**
**
*/
TidyDoc TIDY_CALL tidyCreate(void)
@ -622,8 +623,8 @@ Bool TIDY_CALL tidyOptCopyConfig( TidyDoc to, TidyDoc from )
/* I/O and Message handling interface
**
** By default, Tidy will define, create and use
** instances of input and output handlers for
** By default, Tidy will define, create and use
** instances of input and output handlers for
** standard C buffered I/O (i.e. FILE* stdin,
** FILE* stdout and FILE* stderr for content
** input, content output and diagnostic output,
@ -633,7 +634,7 @@ Bool TIDY_CALL tidyOptCopyConfig( TidyDoc to, TidyDoc from )
*/
/* Use TidyReportFilter to filter messages by diagnostic level:
** info, warning, etc. Just set diagnostic output
** info, warning, etc. Just set diagnostic output
** handler to redirect all diagnostics output. Return true
** to proceed with output, false to cancel.
*/
@ -792,7 +793,7 @@ uint TIDY_CALL tidyConfigErrorCount( TidyDoc tdoc )
}
/* Error reporting functions
/* Error reporting functions
*/
void TIDY_CALL tidyErrorSummary( TidyDoc tdoc )
{
@ -968,7 +969,7 @@ int tidyDocSaveFile( TidyDocImpl* doc, ctmbstr filnam )
if ( doc->errors > 0 &&
cfgBool(doc, TidyWriteBack) && !cfgBool(doc, TidyForceOutput) )
status = tidyDocStatus( doc );
else
else
fout = fopen( filnam, "wb" );
if ( fout )
@ -1002,7 +1003,7 @@ int tidyDocSaveFile( TidyDocImpl* doc, ctmbstr filnam )
** The code has been left in in case it works w/ other compilers
** or operating systems. If stdout is in Text mode, be aware that
** it will garble UTF16 documents. In text mode, when it encounters
** a single byte of value 10 (0xA), it will insert a single byte
** a single byte of value 10 (0xA), it will insert a single byte
** value 13 (0xD) just before it. This has the effect of garbling
** the entire document.
*/
@ -1067,7 +1068,7 @@ int tidyDocSaveString( TidyDocImpl* doc, tmbstr buffer, uint* buflen )
TidyBuffer outbuf;
StreamOut* out;
int status;
tidyBufInitWithAllocator( &outbuf, doc->allocator );
out = TY_(BufferOutput)( doc, &outbuf, outenc, nl );
status = tidyDocSaveStream( doc, out );
@ -1091,7 +1092,7 @@ int tidyDocSaveBuffer( TidyDocImpl* doc, TidyBuffer* outbuf )
uint outenc = cfg( doc, TidyOutCharEncoding );
uint nl = cfg( doc, TidyNewline );
StreamOut* out = TY_(BufferOutput)( doc, outbuf, outenc, nl );
status = tidyDocSaveStream( doc, out );
TidyDocFree( doc, out );
}
@ -1138,7 +1139,7 @@ int TIDY_CALL tidyRunDiagnostics( TidyDoc tdoc )
/* Workhorse functions.
**
** Parse requires input source, all input config items
** Parse requires input source, all input config items
** and diagnostic sink to have all been set before calling.
**
** Emit likewise requires that document sink and all
@ -1220,18 +1221,70 @@ int tidyDocRunDiagnostics( TidyDocImpl* doc )
TY_(ReportMarkupVersion)( doc );
TY_(ReportNumWarnings)( doc );
}
if ( doc->errors > 0 && !force )
TY_(NeedsAuthorIntervention)( doc );
return tidyDocStatus( doc );
}
/* Lookup table pairing an element name with its TidyTag_* identifier.
** inRemovedInfo() scans the `id` column of this table; TY_(CheckHTML5)
** uses that result to decide whether to issue a REMOVED_HTML5 warning.
** The list is terminated by an entry whose `tag` is 0.
*/
static const struct _html5Info
{
    const char *tag;   /* element name as written in markup */
    uint id;           /* matching TidyTag_* identifier */
} html5Info[] = {
    { "acronym",  TidyTag_ACRONYM  },
    { "applet",   TidyTag_APPLET   },
    { "basefont", TidyTag_BASEFONT },
    { "big",      TidyTag_BIG      },
    { "center",   TidyTag_CENTER   },
    { "dir",      TidyTag_DIR      },
    { "font",     TidyTag_FONT     },
    { "frame",    TidyTag_FRAME    },
    { "frameset", TidyTag_FRAMESET },
    { "noframes", TidyTag_NOFRAMES },
    { "strike",   TidyTag_STRIKE   },
    { "tt",       TidyTag_TT       },
    { 0, 0 }   /* sentinel: end of table */
};
/* Report whether the tag identifier `tid` is listed in html5Info.
**
** Returns `yes` when some table entry has a matching `id`, otherwise `no`.
** The scan stops at the sentinel entry (the one whose `tag` is 0).
*/
Bool inRemovedInfo( uint tid )
{
    uint idx = 0;

    while ( html5Info[idx].tag != 0 )
    {
        if ( html5Info[idx].id == tid )
            return yes;
        ++idx;
    }
    return no;
}
/* Walk `node`, all of its following siblings, and (recursively) their
** content, issuing a REMOVED_HTML5 warning for every element node that
** has a known tag and either
**   - whose tag's `versions` mask does not include VERS_HTML5, or
**   - whose tag id is listed in the html5Info table (see inRemovedInfo).
**
** doc  - document the warnings are reported against
** node - first node of the sibling chain to examine; may be NULL,
**        in which case nothing is done
*/
void TY_(CheckHTML5)( TidyDocImpl* doc, Node* node )
{
    for ( ; node; node = node->next )
    {
        if ( TY_(nodeIsElement)(node) && node->tag )
        {
            /* Logical '!' binds tighter than bitwise '&', so the mask test
            ** must be parenthesised; without the parentheses the expression
            ** is (!versions) & VERS_HTML5, which never inspects the HTML5 bit.
            */
            if ( !(node->tag->versions & VERS_HTML5) ||
                 inRemovedInfo( node->tag->id ) )
            {
                TY_(ReportWarning)( doc, node, node, REMOVED_HTML5 );
            }
        }

        /* descend into children before moving on to the next sibling */
        if ( node->content )
            TY_(CheckHTML5)( doc, node->content );
    }
}
int tidyDocCleanAndRepair( TidyDocImpl* doc )
{
Bool word2K = cfgBool( doc, TidyWord2000 );
Bool logical = cfgBool( doc, TidyLogicalEmphasis );
Bool clean = cfgBool( doc, TidyMakeClean );
Bool gdoc = cfgBool( doc, TidyGDocClean );
Bool dropFont = cfgBool( doc, TidyDropFontTags );
Bool htmlOut = cfgBool( doc, TidyHtmlOut );
Bool xmlOut = cfgBool( doc, TidyXmlOut );
@ -1240,13 +1293,16 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
Bool tidyMark = cfgBool( doc, TidyMark );
Bool tidyXmlTags = cfgBool( doc, TidyXmlTags );
Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName );
Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis );
ctmbstr sdef = NULL;
Node* node;
if (tidyXmlTags)
return tidyDocStatus( doc );
/* simplifies <b><b> ... </b> ...</b> etc. */
TY_(NestedEmphasis)( doc, &doc->root );
if ( mergeEmphasis )
TY_(NestedEmphasis)( doc, &doc->root );
/* cleans up <dir>indented text</dir> etc. */
TY_(List2BQ)( doc, &doc->root );
@ -1270,6 +1326,10 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
if ( clean || dropFont )
TY_(CleanDocument)( doc );
/* clean up html exported by Google Docs */
if ( gdoc )
TY_(CleanGoogleDocument)( doc );
/* Move terminating <br /> tags from out of paragraphs */
/*! Do we want to do this for all block-level elements? */
@ -1291,6 +1351,12 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
/* remember given doctype for reporting */
node = TY_(FindDocType)(doc);
sdef = tidyOptGetValue((TidyDoc)doc, TidyDoctype );
if (!sdef)
sdef = tidyOptGetCurrPick((TidyDoc) doc, TidyDoctypeMode );
if (sdef && (strcmp(sdef,"html5") == 0)) {
TY_(CheckHTML5)( doc, &doc->root );
}
if (node)
{
AttVal* fpi = TY_(GetAttrByName)(node, "PUBLIC");
@ -1388,7 +1454,6 @@ int tidyDocSaveStream( TidyDocImpl* doc, StreamOut* out )
{
/* noop */
TY_(DropFontElements)(doc, &doc->root, NULL);
TY_(WbrToSpace)(doc, &doc->root);
}
if ((makeClean && asciiChars) || makeBare)
@ -1439,8 +1504,8 @@ int tidyDocSaveStream( TidyDocImpl* doc, StreamOut* out )
**
** The big issue here is the degree to which we should mimic
** a DOM and/or SAX nodes.
**
** Is it 100% possible (and, if so, how difficult is it) to
**
** Is it 100% possible (and, if so, how difficult is it) to
** emit SAX events from this API? If SAX events are possible,
** is that 100% of data needed to build a DOM?
*/
@ -1571,7 +1636,7 @@ Bool TIDY_CALL tidyNodeGetText( TidyDoc tdoc, TidyNode tnod, TidyBuffer* outbuf
TY_(PFlushLine)( doc, 0 );
doc->docOut = NULL;
TidyDocFree( doc, out );
return yes;
}