/* lexer.c -- Lexer for html parser

  (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
  See tidy.h for the copyright notice.

*/

/*
  Given a file stream fp it returns a sequence of tokens.

     GetToken(fp) gets the next token
     UngetToken(fp) provides one level undo

  The tags include an attribute list:

    - linked list of attribute/value nodes
    - each node has 2 NULL-terminated strings.
    - entities are replaced in attribute values

  white space is compacted if not in preformatted mode
  If not in preformatted mode then leading white space
  is discarded and subsequent white space sequences
  compacted to single space characters.

  If XmlTags is no then Tag names are folded to upper
  case and attribute names to lower case.

 Not yet done:
    -   Doctype subset and marked sections
*/

#include "tidy-int.h"
#include "lexer.h"
#include "parser.h"
#include "entities.h"
#include "streamio.h"
#include "message.h"
#include "tmbstr.h"
#include "clean.h"
#include "utf8.h"
#include "streamio.h"
#ifdef _MSC_VER
#include "sprtf.h"
#endif

#ifndef SPRTF
#define SPRTF printf
#endif

#if !defined(NDEBUG) && defined(_MSC_VER)
/* #define DEBUG_ALLOCATION special EXTRA allocation debug information - VERY NOISY */
static void check_me(char *name);
static Bool show_attrs = yes;
#define MX_TXT 8
static char buffer[(MX_TXT*4)+8]; /* NOTE extra for '...'\0 tail */

/* Debug only: render a text node's bytes into the static `buffer`.
** Newlines are shown as the two characters '\' 'n' and runs of spaces
** are collapsed to one.  Text longer than (MX_TXT * 2) + 3 bytes is
** abbreviated to its head, up to three dots, and its last MX_TXT bytes.
** Returns `buffer`, so the result is overwritten by the next call.
*/
static tmbstr get_text_string(Lexer* lexer, Node *node)
{
    uint len = node->end - node->start;
    tmbstr cp = lexer->lexbuf + node->start;
    tmbstr end = lexer->lexbuf + node->end;
    unsigned char c;
    uint i = 0;
    Bool insp = no;
    if (len <= ((MX_TXT * 2) + 3)) {
        /* short text: copy it all */
        buffer[0] = 0;
        while (cp < end) {
            c = *cp;
            cp++;
            if (c == '\n') {
                buffer[i++] = '\\';
                buffer[i++] = 'n';
            } else if ( c == ' ' ) {
                if (!insp)
                    buffer[i++] = c;
                insp = yes;
            } else {
                buffer[i++] = c;
                insp = no;
            }
        }
    } else {
        /* long text: head, dots, tail */
        char *end1 = cp + MX_TXT;
        char *bgn = cp + (len - MX_TXT);
        buffer[0] = 0;
        if (bgn < end1)
            bgn = end1;
        while (cp < end1) {
            c = *cp;
            cp++;
            if (c == '\n') {
                buffer[i++] = '\\';
                buffer[i++] = 'n';
            } else if ( c == ' ' ) {
                if (!insp)
                    buffer[i++] = c;
                insp = yes;
            } else {
                buffer[i++] = c;
                insp = no;
            }
            if (i >= MX_TXT)
                break;
        }
        c = '.';
        if ((i < len)&&(cp < bgn)) {
            buffer[i++] = c;
            cp++;
            if ((i < len)&&(cp < bgn)) {
                buffer[i++] = c;
                cp++;
                if ((i < len)&&(cp < bgn)) {
                    buffer[i++] = c;
                    cp++;
                }
            }
        }
        cp = bgn;
        insp = no;
        while (cp < end) {
            c = *cp;
            cp++;
            if (c == '\n') {
                buffer[i++] = '\\';
                buffer[i++] = 'n';
            } else if ( c == ' ' ) {
                if (!insp)
                    buffer[i++] = c;
                insp = yes;
            } else {
                buffer[i++] = c;
                insp = no;
            }
        }
    }
    buffer[i] = 0;
    return buffer;
}

/* Debug only: print the node about to be returned, prefixed by the
** lexer's current line/column.  A msg starting with "le" is reported
** as coming from the "lexer", anything else from the "stream".
*/
static void Show_Node( TidyDocImpl* doc, const char *msg, Node *node )
{
    Lexer* lexer = doc->lexer;
    Bool lex = ((msg[0] == 'l')&&(msg[1] == 'e')) ? yes : no;
    int line = ( doc->lexer ? doc->lexer->lines : 0 );
    int col = ( doc->lexer ? doc->lexer->columns : 0 );
    tmbstr src = lex ? "lexer" : "stream";
    SPRTF("R=%d C=%d: ", line, col );
    /* DEBUG: Be able to set a TRAP on a SPECIFIC row,col */
    if ((line == 67) && (col == 95)) {
        check_me("Show_Node"); /* just a debug trap */
    }
    if (lexer && lexer->token &&
        ((lexer->token->type == TextNode)||(node && (node->type == TextNode)))) {
        if (show_attrs) {
            uint len = node ? node->end - node->start : 0;
            tmbstr cp = node ? get_text_string( lexer, node ) : "NULL";
            SPRTF("Returning %s TextNode [%s]%u %s\n", msg, cp, len, src );
        } else {
            SPRTF("Returning %s TextNode %p... %s\n", msg, node, src );
        }
    } else {
        tmbstr name = node ? node->element ? node->element : "blank" : "NULL";
        if (show_attrs) {
            AttVal* av;
            SPRTF("Returning %s node <%s", msg, name);
            if (node) {
                for (av = node->attributes; av; av = av->next) {
                    name = av->attribute;
                    if (name) {
                        SPRTF(" %s",name);
                        if (av->value) {
                            SPRTF("=\"%s\"", av->value);
                        }
                    }
                }
            }
            SPRTF("> %s\n", src);
        } else {
            SPRTF("Returning %s node %p <%s>... %s\n", msg, node, name, src );
        }
    }
}
#define GTDBG(a,b,c) Show_Node(a,b,c)
#else
#define GTDBG(a,b,c)
#endif

/* Forward references */
/* swallows closing '>' */
static AttVal *ParseAttrs( TidyDocImpl* doc, Bool *isempty );
static tmbstr ParseAttribute( TidyDocImpl* doc, Bool* isempty, Node **asp, Node **php );
static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, Bool foldCase, Bool *isempty, int *pdelim );
static Node *ParseDocTypeDecl(TidyDocImpl* doc);
static void AddAttrToList( AttVal** list, AttVal* av );

/* used to classify characters for lexical purposes */
/* characters >= 128 always map to 0, i.e. belong to no class */
#define MAP(c) ((unsigned)c < 128 ? lexmap[(unsigned)c] : 0)
static uint lexmap[128];

#define IsValidXMLAttrName(name) TY_(IsValidXMLID)(name)
#define IsValidXMLElemName(name) TY_(IsValidXMLID)(name)

/* Known doctypes.  `score` ranks candidates in TY_(HTMLVersion), where
** the lowest non-zero score wins; `fpi` and `si` are the two strings
** stored for each entry, NULL when it has none.
** The table is terminated by an entry with a NULL name.
*/
static struct _doctypes
{
    uint score;
    uint vers;
    ctmbstr name;
    ctmbstr fpi;
    ctmbstr si;
} const W3C_Doctypes[] =
{
    { 2, HT20, "HTML 2.0", "-//IETF//DTD HTML 2.0//EN", NULL, },
    { 2, HT20, "HTML 2.0", "-//IETF//DTD HTML//EN", NULL, },
    { 2, HT20, "HTML 2.0", "-//W3C//DTD HTML 2.0//EN", NULL, },
    { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2//EN", NULL, },
    { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Final//EN", NULL, },
    { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Draft//EN", NULL, },
    { 6, H40S, "HTML 4.0 Strict", "-//W3C//DTD HTML 4.0//EN", "http://www.w3.org/TR/REC-html40/strict.dtd" },
    { 8, H40T, "HTML 4.0 Transitional", "-//W3C//DTD HTML 4.0 Transitional//EN", "http://www.w3.org/TR/REC-html40/loose.dtd" },
    { 7, H40F, "HTML 4.0 Frameset", "-//W3C//DTD HTML 4.0 Frameset//EN", "http://www.w3.org/TR/REC-html40/frameset.dtd" },
    { 3, H41S, "HTML 4.01 Strict", "-//W3C//DTD HTML 4.01//EN", "http://www.w3.org/TR/html4/strict.dtd" },
    { 5, H41T, "HTML 4.01 Transitional", "-//W3C//DTD HTML 4.01 Transitional//EN", "http://www.w3.org/TR/html4/loose.dtd" },
    { 4, H41F, "HTML 4.01 Frameset", "-//W3C//DTD HTML 4.01 Frameset//EN", "http://www.w3.org/TR/html4/frameset.dtd" },
    { 9, X10S, "XHTML 1.0 Strict", "-//W3C//DTD XHTML 1.0 Strict//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" },
    { 11, X10T, "XHTML 1.0 Transitional", "-//W3C//DTD XHTML 1.0 Transitional//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" },
    { 10, X10F, "XHTML 1.0 Frameset", "-//W3C//DTD XHTML 1.0 Frameset//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd" },
    { 12, XH11, "XHTML 1.1", "-//W3C//DTD XHTML 1.1//EN", "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd" },
    { 13, XB10, "XHTML Basic 1.0", "-//W3C//DTD XHTML Basic 1.0//EN", "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd" },
    { 20, HT50, "HTML5", NULL, NULL },
    { 21, XH50, "XHTML5", NULL, NULL },

    /* reminder to add XHTML Print 1.0 support, see http://www.w3.org/TR/xhtml-print */
#if 0
    { 14, XP10, "XHTML Print 1.0", "-//W3C//DTD XHTML-Print 1.0//EN", "http://www.w3.org/MarkUp/DTD/xhtml-print10.dtd" },
    { 14, XP10, "XHTML Print 1.0", "-//PWG//DTD XHTML-Print 1.0//EN", "http://www.xhtml-print.org/xhtml-print/xhtml-print10.dtd" },
#endif
    /* final entry */
    { 0, 0, NULL, NULL, NULL }
};

/* Choose the version to report for the document.
** Returns XH50/HT50 directly for the unknown/HTML5 doctype shortcuts
** below; otherwise scans W3C_Doctypes for the entry with the lowest
** score whose version bit is still set in lexer->versions (after the
** xhtml / html4 filters), or VERS_UNKNOWN when nothing matches.
*/
int TY_(HTMLVersion)(TidyDocImpl* doc)
{
    uint i;
    uint j = 0;
    uint score = 0;
    uint vers = doc->lexer->versions;
    uint dtver = doc->lexer->doctype;
    TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
    Bool xhtml = (cfgBool(doc, TidyXmlOut) || doc->lexer->isvoyager) &&
                 !cfgBool(doc, TidyHtmlOut);
    Bool html4 = ((dtmode == TidyDoctypeStrict) || (dtmode == TidyDoctypeLoose) ||
                  (VERS_FROM40 & dtver) ? yes : no);
    Bool html5 = (!html4 && ((dtmode == TidyDoctypeAuto) ||
                  (dtmode == TidyDoctypeHtml5)) ? yes : no);

    if (xhtml && dtver == VERS_UNKNOWN)
        return XH50;
    if (dtver == VERS_UNKNOWN)
        return HT50;
    /* Issue #167 - if NOT XHTML, and doctype is default VERS_HTML5, then return HT50 */
    if (!xhtml && (dtver == VERS_HTML5))
        return HT50;
    /* Issue #377 - If xhtml and (doctype == html5) and constrained vers contains XH50 return that,
       and really if tidy defaults to 'html5', then maybe 'auto' should also apply! */
    if (xhtml && html5 && ((vers & VERS_HTML5) == XH50))
        return XH50;

    for (i = 0; W3C_Doctypes[i].name; ++i)
    {
        if ((xhtml && !(VERS_XHTML & W3C_Doctypes[i].vers)) ||
            (html4 && !(VERS_FROM40 & W3C_Doctypes[i].vers)))
            continue;

        /* keep the allowed entry with the lowest score seen so far */
        if (vers & W3C_Doctypes[i].vers &&
            (W3C_Doctypes[i].score < score || !score))
        {
            score = W3C_Doctypes[i].score;
            j = i;
        }
    }

    if (score)
        return W3C_Doctypes[j].vers;

    return VERS_UNKNOWN;
}

/* fpi of the first table entry whose vers equals `vers`, else NULL */
static ctmbstr GetFPIFromVers(uint vers)
{
    uint i;

    for (i = 0; W3C_Doctypes[i].name; ++i)
        if (W3C_Doctypes[i].vers == vers)
            return W3C_Doctypes[i].fpi;

    return NULL;
}

/* si of the first table entry whose vers equals `vers`, else NULL */
static ctmbstr GetSIFromVers(uint vers)
{
    uint i;

    for (i = 0; W3C_Doctypes[i].name; ++i)
        if (W3C_Doctypes[i].vers == vers)
            return W3C_Doctypes[i].si;

    return NULL;
}

/* name of the first table entry whose vers equals `vers`, else NULL */
static ctmbstr GetNameFromVers(uint vers)
{
    uint i;

    for (i = 0; W3C_Doctypes[i].name; ++i)
        if (W3C_Doctypes[i].vers == vers)
            return W3C_Doctypes[i].name;

    return NULL;
}

/* vers of the first entry whose fpi matches `fpi` (compared with
** tmbstrcasecmp), else 0; entries without an fpi are skipped */
static uint GetVersFromFPI(ctmbstr fpi)
{
    uint i;

    for (i = 0; W3C_Doctypes[i].name; ++i)
        if (W3C_Doctypes[i].fpi != NULL &&
            TY_(tmbstrcasecmp)(W3C_Doctypes[i].fpi, fpi) == 0)
            return W3C_Doctypes[i].vers;

    return 0;
}

#if (defined(_MSC_VER) && !defined(NDEBUG))
/* Issue #377 - Output diminishing version bits */
typedef struct tagV2S {
    uint bit;
    ctmbstr val;
}V2S, *PV2S;

static V2S v2s[] = {
    { HT20, "HT20" },
    { HT32, "HT32" },
    { H40S, "H40S" },
    { H40T, "H40T" },
    { H40F, "H40F" },
    { H41S, "H41S" },
    { H41T, "H41T" },
    { H41F, "H41F" },
    { X10S, "X10S" },
    { X10T, "X10T" },
    { X10F, "X10F" },
    { XH11, "XH11" },
    { XB10, "XB10" },
    /* 4096u */
    /* { VERS_SUN, "VSUN" }, */
    /* { VERS_NETSCAPE, "VNET" }, */
    /* { VERS_MICROSOFT, "VMIC" }, 32768u */
    { VERS_XML, "VXML" },
    /* 65536u */
    /* HTML5 */
    { HT50, "HT50" },
    /* 131072u */
    { XH50, "XH50" },
    /* 262144u */
    { 0, 0 }
};

/* Process the above table, adding a bit name,
   or '----' when not present.
   Appends to the NUL-terminated string already in buf, using
   unbounded strcat/sprintf, and returns buf. */
static char *add_vers_string( tmbstr buf, uint vers )
{
    PV2S pv2s = v2s;
    int len = (int)strlen(buf);
    while (pv2s->val) {
        if (vers & pv2s->bit) {
            if (len) {
                strcat(buf,"|");
                len++;
            }
            strcat(buf,pv2s->val);
            len += (int)strlen(pv2s->val);
            vers &= ~(pv2s->bit);
            if (!vers)
                break;
        } else {
            if (len) {
                strcat(buf,"|");
                len++;
            }
            strcat(buf,"----");
            len += 4;
        }
        pv2s++;
    }
    if (vers) { /* Should not have any here! */
        if (len)
            strcat(buf,"|");
        sprintf(EndBuf(buf),"%u",vers);
    }
    return buf;
}

/* Issue #377 - Show first Before: list, and then on any change
   Note the VERS_PROPRIETARY are exclude since they always remain */
void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers)
{
    static char vcur[256];
    static Bool dnfirst = no;
    uint curr = doc->lexer->versions; /* get current */
    doc->lexer->versions &= (vers | VERS_PROPRIETARY);
    if (curr != doc->lexer->versions) { /* only if different */
        if (!dnfirst) {
            dnfirst = yes;
            vcur[0] = 0;
            curr &= ~(VERS_PROPRIETARY);
            add_vers_string( vcur, curr );
            SPRTF("Before: %s\n", vcur);
        }
        vcur[0] = 0;
        curr = doc->lexer->versions;
        curr &= ~(VERS_PROPRIETARY);
        add_vers_string( vcur, curr );
        SPRTF("After : %s\n", vcur);
    }
}
#else /* !#if (defined(_MSC_VER) && !defined(NDEBUG)) */
/* everything is allowed in proprietary version of HTML */
/* this is handled here rather than in the tag/attr dicts */
void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers)
{
    doc->lexer->versions &= (vers | VERS_PROPRIETARY);
}
#endif /* #if (defined(_MSC_VER) && !defined(NDEBUG)) y/n */

/* The classifiers below test a class bit in lexmap via MAP(), so they
** are always false for c >= 128. */
Bool TY_(IsWhite)(uint c)
{
    uint map = MAP(c);

    return (map & white)!=0;
}

Bool TY_(IsNewline)(uint c)
{
    uint map = MAP(c);
    return (map & newline)!=0;
}

Bool TY_(IsDigit)(uint c)
{
    uint map;

    map = MAP(c);

    return (map & digit)!=0;
}

static Bool IsDigitHex(uint c)
{
    uint map;

    map = MAP(c);

    return (map & digithex)!=0;
}

Bool TY_(IsLetter)(uint c)
{
    uint map;

    map = MAP(c);

    return (map & letter)!=0;
}

/* true for space, tab, LF, FF and CR only */
Bool TY_(IsHTMLSpace)(uint c)
{
    return c == 0x020 || c == 0x009 || c == 0x00a || c == 0x00c || c == 0x00d;
}

Bool TY_(IsNamechar)(uint c)
{
    uint map = MAP(c);
    return (map & namechar)!=0;
}

/* Range table test for XML letters.  NOTE(review): the final three
** terms repeat the three before them; harmless, but redundant. */
Bool TY_(IsXMLLetter)(uint c)
{
    return ((c >= 0x41 && c <= 0x5a) ||
        (c >= 0x61 && c <= 0x7a) ||
        (c >= 0xc0 && c <= 0xd6) ||
        (c >= 0xd8 && c <= 0xf6) ||
        (c >= 0xf8 && c <= 0xff) ||
        (c >= 0x100 && c <= 0x131) ||
        (c >= 0x134 && c <= 0x13e) ||
        (c >= 0x141 && c <= 0x148) ||
        (c >= 0x14a && c <= 0x17e) ||
        (c >= 0x180 && c <= 0x1c3) ||
        (c >= 0x1cd && c <= 0x1f0) ||
        (c >= 0x1f4 && c <= 0x1f5) ||
        (c >= 0x1fa && c <= 0x217) ||
        (c >= 0x250 && c <= 0x2a8) ||
        (c >= 0x2bb && c <= 0x2c1) ||
        c == 0x386 ||
        (c >= 0x388 && c <= 0x38a) ||
        c == 0x38c ||
        (c >= 0x38e && c <= 0x3a1) ||
        (c >= 0x3a3 && c <= 0x3ce) ||
        (c >= 0x3d0 && c <= 0x3d6) ||
        c == 0x3da ||
        c == 0x3dc ||
        c == 0x3de ||
        c == 0x3e0 ||
        (c >= 0x3e2 && c <= 0x3f3) ||
        (c >= 0x401 && c <= 0x40c) ||
        (c >= 0x40e && c <= 0x44f) ||
        (c >= 0x451 && c <= 0x45c) ||
        (c >= 0x45e && c <= 0x481) ||
        (c >= 0x490 && c <= 0x4c4) ||
        (c >= 0x4c7 && c <= 0x4c8) ||
        (c >= 0x4cb && c <= 0x4cc) ||
        (c >= 0x4d0 && c <= 0x4eb) ||
        (c >= 0x4ee && c <= 0x4f5) ||
        (c >= 0x4f8 && c <= 0x4f9) ||
        (c >= 0x531 && c <= 0x556) ||
        c == 0x559 ||
        (c >= 0x561 && c <= 0x586) ||
        (c >= 0x5d0 && c <= 0x5ea) ||
        (c >= 0x5f0 && c <= 0x5f2) ||
        (c >= 0x621 && c <= 0x63a) ||
        (c >= 0x641 && c <= 0x64a) ||
        (c >= 0x671 && c <= 0x6b7) ||
        (c >= 0x6ba && c <= 0x6be) ||
        (c >= 0x6c0 && c <= 0x6ce) ||
        (c >= 0x6d0 && c <= 0x6d3) ||
        c == 0x6d5 ||
        (c >= 0x6e5 && c <= 0x6e6) ||
        (c >= 0x905 && c <= 0x939) ||
        c == 0x93d ||
        (c >= 0x958 && c <= 0x961) ||
        (c >= 0x985 && c <= 0x98c) ||
        (c >= 0x98f && c <= 0x990) ||
        (c >= 0x993 && c <= 0x9a8) ||
        (c >= 0x9aa && c <= 0x9b0) ||
        c == 0x9b2 ||
        (c >= 0x9b6 && c <= 0x9b9) ||
        (c >= 0x9dc && c <= 0x9dd) ||
        (c >= 0x9df && c <= 0x9e1) ||
        (c >= 0x9f0 && c <= 0x9f1) ||
        (c >= 0xa05 && c <= 0xa0a) ||
        (c >= 0xa0f && c <= 0xa10) ||
        (c >= 0xa13 && c <= 0xa28) ||
        (c >= 0xa2a && c <= 0xa30) ||
        (c >= 0xa32 && c <= 0xa33) ||
        (c >= 0xa35 && c <= 0xa36) ||
        (c >= 0xa38 && c <= 0xa39) ||
        (c >= 0xa59 && c <= 0xa5c) ||
        c == 0xa5e ||
        (c >= 0xa72 && c <= 0xa74) ||
        (c >= 0xa85 && c <= 0xa8b) ||
        c == 0xa8d ||
        (c >= 0xa8f && c <= 0xa91) ||
        (c >= 0xa93 && c <= 0xaa8) ||
        (c >= 0xaaa && c <= 0xab0) ||
        (c >= 0xab2 && c <= 0xab3) ||
        (c >= 0xab5 && c <= 0xab9) ||
        c == 0xabd ||
        c == 0xae0 ||
        (c >= 0xb05 && c <= 0xb0c) ||
        (c >= 0xb0f && c <= 0xb10) ||
        (c >= 0xb13 && c <= 0xb28) ||
        (c >= 0xb2a && c <= 0xb30) ||
        (c >= 0xb32 && c <= 0xb33) ||
        (c >= 0xb36 && c <= 0xb39) ||
        c == 0xb3d ||
        (c >= 0xb5c && c <= 0xb5d) ||
        (c >= 0xb5f && c <= 0xb61) ||
        (c >= 0xb85 && c <= 0xb8a) ||
        (c >= 0xb8e && c <= 0xb90) ||
        (c >= 0xb92 && c <= 0xb95) ||
        (c >= 0xb99 && c <= 0xb9a) ||
        c == 0xb9c ||
        (c >= 0xb9e && c <= 0xb9f) ||
        (c >= 0xba3 && c <= 0xba4) ||
        (c >= 0xba8 && c <= 0xbaa) ||
        (c >= 0xbae && c <= 0xbb5) ||
        (c >= 0xbb7 && c <= 0xbb9) ||
        (c >= 0xc05 && c <= 0xc0c) ||
        (c >= 0xc0e && c <= 0xc10) ||
        (c >= 0xc12 && c <= 0xc28) ||
        (c >= 0xc2a && c <= 0xc33) ||
        (c >= 0xc35 && c <= 0xc39) ||
        (c >= 0xc60 && c <= 0xc61) ||
        (c >= 0xc85 && c <= 0xc8c) ||
        (c >= 0xc8e && c <= 0xc90) ||
        (c >= 0xc92 && c <= 0xca8) ||
        (c >= 0xcaa && c <= 0xcb3) ||
        (c >= 0xcb5 && c <= 0xcb9) ||
        c == 0xcde ||
        (c >= 0xce0 && c <= 0xce1) ||
        (c >= 0xd05 && c <= 0xd0c) ||
        (c >= 0xd0e && c <= 0xd10) ||
        (c >= 0xd12 && c <= 0xd28) ||
        (c >= 0xd2a && c <= 0xd39) ||
        (c >= 0xd60 && c <= 0xd61) ||
        (c >= 0xe01 && c <= 0xe2e) ||
        c == 0xe30 ||
        (c >= 0xe32 && c <= 0xe33) ||
        (c >= 0xe40 && c <= 0xe45) ||
        (c >= 0xe81 && c <= 0xe82) ||
        c == 0xe84 ||
        (c >= 0xe87 && c <= 0xe88) ||
        c == 0xe8a ||
        c == 0xe8d ||
        (c >= 0xe94 && c <= 0xe97) ||
        (c >= 0xe99 && c <= 0xe9f) ||
        (c >= 0xea1 && c <= 0xea3) ||
        c == 0xea5 ||
        c == 0xea7 ||
        (c >= 0xeaa && c <= 0xeab) ||
        (c >= 0xead && c <= 0xeae) ||
        c == 0xeb0 ||
        (c >= 0xeb2 && c <= 0xeb3) ||
        c == 0xebd ||
        (c >= 0xec0 && c <= 0xec4) ||
        (c >= 0xf40 && c <= 0xf47) ||
        (c >= 0xf49 && c <= 0xf69) ||
        (c >= 0x10a0 && c <= 0x10c5) ||
        (c >= 0x10d0 && c <= 0x10f6) ||
        c == 0x1100 ||
        (c >= 0x1102 && c <= 0x1103) ||
        (c >= 0x1105 && c <= 0x1107) ||
        c == 0x1109 ||
        (c >= 0x110b && c <= 0x110c) ||
        (c >= 0x110e && c <= 0x1112) ||
        c == 0x113c ||
        c == 0x113e ||
        c == 0x1140 ||
        c == 0x114c ||
        c == 0x114e ||
        c == 0x1150 ||
        (c >= 0x1154 && c <= 0x1155) ||
        c == 0x1159 ||
        (c >= 0x115f && c <= 0x1161) ||
        c == 0x1163 ||
        c == 0x1165 ||
        c == 0x1167 ||
        c == 0x1169 ||
        (c >= 0x116d && c <= 0x116e) ||
        (c >= 0x1172 && c <= 0x1173) ||
        c == 0x1175 ||
        c == 0x119e ||
        c == 0x11a8 ||
        c == 0x11ab ||
        (c >= 0x11ae && c <= 0x11af) ||
        (c >= 0x11b7 && c <= 0x11b8) ||
        c == 0x11ba ||
        (c >= 0x11bc && c <= 0x11c2) ||
        c == 0x11eb ||
        c == 0x11f0 ||
        c == 0x11f9 ||
        (c >= 0x1e00 && c <= 0x1e9b) ||
        (c >= 0x1ea0 && c <= 0x1ef9) ||
        (c >= 0x1f00 && c <= 0x1f15) ||
        (c >= 0x1f18 && c <= 0x1f1d) ||
        (c >= 0x1f20 && c <= 0x1f45) ||
        (c >= 0x1f48 && c <= 0x1f4d) ||
        (c >= 0x1f50 && c <= 0x1f57) ||
        c == 0x1f59 ||
        c == 0x1f5b ||
        c == 0x1f5d ||
        (c >= 0x1f5f && c <= 0x1f7d) ||
        (c >= 0x1f80 && c <= 0x1fb4) ||
        (c >= 0x1fb6 && c <= 0x1fbc) ||
        c == 0x1fbe ||
        (c >= 0x1fc2 && c <= 0x1fc4) ||
        (c >= 0x1fc6 && c <= 0x1fcc) ||
        (c >= 0x1fd0 && c <= 0x1fd3) ||
        (c >= 0x1fd6 && c <= 0x1fdb) ||
        (c >= 0x1fe0 && c <= 0x1fec) ||
        (c >= 0x1ff2 && c <= 0x1ff4) ||
        (c >= 0x1ff6 && c <= 0x1ffc) ||
        c == 0x2126 ||
        (c >= 0x212a && c <= 0x212b) ||
        c == 0x212e ||
        (c >= 0x2180 && c <= 0x2182) ||
        (c >= 0x3041 && c <= 0x3094) ||
        (c >= 0x30a1 && c <= 0x30fa) ||
        (c >= 0x3105 && c <= 0x312c) ||
        (c >= 0xac00 && c <= 0xd7a3) ||
        (c >= 0x4e00 && c <= 0x9fa5) ||
        c == 0x3007 ||
        (c >= 0x3021 && c <= 0x3029) ||
        (c >= 0x4e00 && c <= 0x9fa5) ||
        c == 0x3007 ||
        (c >= 0x3021 && c <= 0x3029));
}

/* An XML letter, one of '.', '_', ':', '-', or a character in the
** additional ranges listed below. */
Bool TY_(IsXMLNamechar)(uint c)
{
    return (TY_(IsXMLLetter)(c) ||
        c == '.' || c == '_' ||
        c == ':' || c == '-' ||
        (c >= 0x300 && c <= 0x345) ||
        (c >= 0x360 && c <= 0x361) ||
        (c >= 0x483 && c <= 0x486) ||
        (c >= 0x591 && c <= 0x5a1) ||
        (c >= 0x5a3 && c <= 0x5b9) ||
        (c >= 0x5bb && c <= 0x5bd) ||
        c == 0x5bf ||
        (c >= 0x5c1 && c <= 0x5c2) ||
        c == 0x5c4 ||
        (c >= 0x64b && c <= 0x652) ||
        c == 0x670 ||
        (c >= 0x6d6 && c <= 0x6dc) ||
        (c >= 0x6dd && c <= 0x6df) ||
        (c >= 0x6e0 && c <= 0x6e4) ||
        (c >= 0x6e7 && c <= 0x6e8) ||
        (c >= 0x6ea && c <= 0x6ed) ||
        (c >= 0x901 && c <= 0x903) ||
        c == 0x93c ||
        (c >= 0x93e && c <= 0x94c) ||
        c == 0x94d ||
        (c >= 0x951 && c <= 0x954) ||
        (c >= 0x962 && c <= 0x963) ||
        (c >= 0x981 && c <= 0x983) ||
        c == 0x9bc ||
        c == 0x9be ||
        c == 0x9bf ||
        (c >= 0x9c0 && c <= 0x9c4) ||
        (c >= 0x9c7 && c <= 0x9c8) ||
        (c >= 0x9cb && c <= 0x9cd) ||
        c == 0x9d7 ||
        (c >= 0x9e2 && c <= 0x9e3) ||
        c == 0xa02 ||
        c == 0xa3c ||
        c == 0xa3e ||
        c == 0xa3f ||
        (c >= 0xa40 && c <= 0xa42) ||
        (c >= 0xa47 && c <= 0xa48) ||
        (c >= 0xa4b && c <= 0xa4d) ||
        (c >= 0xa70 && c <= 0xa71) ||
        (c >= 0xa81 && c <= 0xa83) ||
        c == 0xabc ||
        (c >= 0xabe && c <= 0xac5) ||
        (c >= 0xac7 && c <= 0xac9) ||
        (c >= 0xacb && c <= 0xacd) ||
        (c >= 0xb01 && c <= 0xb03) ||
        c == 0xb3c ||
        (c >= 0xb3e && c <= 0xb43) ||
        (c >= 0xb47 && c <= 0xb48) ||
        (c >= 0xb4b && c <= 0xb4d) ||
        (c >= 0xb56 && c <= 0xb57) ||
        (c >= 0xb82 && c <= 0xb83) ||
        (c >= 0xbbe && c <= 0xbc2) ||
        (c >= 0xbc6 && c <= 0xbc8) ||
        (c >= 0xbca && c <= 0xbcd) ||
        c == 0xbd7 ||
        (c >= 0xc01 && c <= 0xc03) ||
        (c >= 0xc3e && c <= 0xc44) ||
        (c >= 0xc46 && c <= 0xc48) ||
        (c >= 0xc4a && c <= 0xc4d) ||
        (c >= 0xc55 && c <= 0xc56) ||
        (c >= 0xc82 && c <= 0xc83) ||
        (c >= 0xcbe && c <= 0xcc4) ||
        (c >= 0xcc6 && c <= 0xcc8) ||
        (c >= 0xcca && c <= 0xccd) ||
        (c >= 0xcd5 && c <= 0xcd6) ||
        (c >= 0xd02 && c <= 0xd03) ||
        (c >= 0xd3e && c <= 0xd43) ||
        (c >= 0xd46 && c <= 0xd48) ||
        (c >= 0xd4a && c <= 0xd4d) ||
        c == 0xd57 ||
        c == 0xe31 ||
        (c >= 0xe34 && c <= 0xe3a) ||
        (c >= 0xe47 && c <= 0xe4e) ||
        c == 0xeb1 ||
        (c >= 0xeb4 && c <= 0xeb9) ||
        (c >= 0xebb && c <= 0xebc) ||
        (c >= 0xec8 && c <= 0xecd) ||
        (c >= 0xf18 && c <= 0xf19) ||
        c == 0xf35 ||
        c == 0xf37 ||
        c == 0xf39 ||
        c == 0xf3e ||
        c == 0xf3f ||
        (c >= 0xf71 && c <= 0xf84) ||
        (c >= 0xf86 && c <= 0xf8b) ||
        (c >= 0xf90 && c <= 0xf95) ||
        c == 0xf97 ||
        (c >= 0xf99 && c <= 0xfad) ||
        (c >= 0xfb1 && c <= 0xfb7) ||
        c == 0xfb9 ||
        (c >= 0x20d0 && c <= 0x20dc) ||
        c == 0x20e1 ||
        (c >= 0x302a && c <= 0x302f) ||
        c == 0x3099 ||
        c == 0x309a ||
        (c >= 0x30 && c <= 0x39) ||
        (c >= 0x660 && c <= 0x669) ||
        (c >= 0x6f0 && c <= 0x6f9) ||
        (c >= 0x966 && c <= 0x96f) ||
        (c >= 0x9e6 && c <= 0x9ef) ||
        (c >= 0xa66 && c <= 0xa6f) ||
        (c >= 0xae6 && c <= 0xaef) ||
        (c >= 0xb66 && c <= 0xb6f) ||
        (c >= 0xbe7 && c <= 0xbef) ||
        (c >= 0xc66 && c <= 0xc6f) ||
        (c >= 0xce6 && c <= 0xcef) ||
        (c >= 0xd66 && c <= 0xd6f) ||
        (c >= 0xe50 && c <= 0xe59) ||
        (c >= 0xed0 && c <= 0xed9) ||
        (c >= 0xf20 && c <= 0xf29) ||
        c == 0xb7 ||
        c == 0x2d0 ||
        c == 0x2d1 ||
        c == 0x387 ||
        c == 0x640 ||
        c == 0xe46 ||
        c == 0xec6 ||
        c == 0x3005 ||
        (c >= 0x3031 && c <= 0x3035) ||
        (c >= 0x309d && c <= 0x309e) ||
        (c >= 0x30fc && c <= 0x30fe));
}

#if 0
Bool IsLower(uint c)
{
    uint map = MAP(c);

    return (map & lowercase)!=0;
}
#endif

Bool TY_(IsUpper)(uint c)
{
    uint map = MAP(c);

    return (map & uppercase)!=0;
}

/* fold a character classed as uppercase in lexmap; others unchanged */
uint TY_(ToLower)(uint c)
{
    uint map = MAP(c);

    if (map & uppercase)
        c += 'a' - 'A';

    return c;
}

/* fold a character classed as lowercase in lexmap; others unchanged */
uint TY_(ToUpper)(uint c)
{
    uint map = MAP(c);

    if (map & lowercase)
        c += (uint) ('A' - 'a' );

    return c;
}

#if 0
char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps )
{
    if ( !cfgBool(doc, TidyXmlTags) )
    {
        if ( tocaps )
        {
            c = (tmbchar) ToUpper(c);
        }
        else /* force to lower case */
        {
            c = (tmbchar) ToLower(c);
        }
    }
    return c;
}
#endif

/*
 return last character in string
 this is useful when trailing quotemark
 is missing on an attribute
 (returns 0 for a NULL or empty string)
*/
static tmbchar LastChar( tmbstr str )
{
    if ( str && *str )
    {
        int n = TY_(tmbstrlen)(str);
        return str[n-1];
    }
    return 0;
}

/*
   node->type is one of these:

    #define TextNode    1
    #define StartTag    2
    #define EndTag      3
#define StartEndTag 4 */ Lexer* TY_(NewLexer)( TidyDocImpl* doc ) { Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) ); if ( lexer != NULL ) { TidyClearMemory( lexer, sizeof(Lexer) ); lexer->allocator = doc->allocator; lexer->lines = 1; lexer->columns = 1; lexer->state = LEX_CONTENT; lexer->versions = (VERS_ALL|VERS_PROPRIETARY); lexer->doctype = VERS_UNKNOWN; lexer->root = &doc->root; } return lexer; } static Bool EndOfInput( TidyDocImpl* doc ) { assert( doc->docIn != NULL ); return ( !doc->docIn->pushed && TY_(IsEOF)(doc->docIn) ); } void TY_(FreeLexer)( TidyDocImpl* doc ) { Lexer *lexer = doc->lexer; if ( lexer ) { TY_(FreeStyles)( doc ); /* See GetToken() */ if ( lexer->pushed || lexer->itoken ) { if (lexer->pushed) TY_(FreeNode)( doc, lexer->itoken ); TY_(FreeNode)( doc, lexer->token ); } while ( lexer->istacksize > 0 ) TY_(PopInline)( doc, NULL ); TidyDocFree( doc, lexer->istack ); TidyDocFree( doc, lexer->lexbuf ); TidyDocFree( doc, lexer ); doc->lexer = NULL; } } /* Lexer uses bigger memory chunks than pprint as ** it must hold the entire input document. not just ** the last line or three. 
*/ static void AddByte( Lexer *lexer, tmbchar ch ) { if ( lexer->lexsize + 2 >= lexer->lexlength ) { tmbstr buf = NULL; uint allocAmt = lexer->lexlength; while ( lexer->lexsize + 2 >= allocAmt ) { if ( allocAmt == 0 ) allocAmt = 8192; else allocAmt *= 2; } buf = (tmbstr) TidyRealloc( lexer->allocator, lexer->lexbuf, allocAmt ); if ( buf ) { TidyClearMemory( buf + lexer->lexlength, allocAmt - lexer->lexlength ); lexer->lexbuf = buf; lexer->lexlength = allocAmt; } } lexer->lexbuf[ lexer->lexsize++ ] = ch; lexer->lexbuf[ lexer->lexsize ] = '\0'; /* debug */ } static void ChangeChar( Lexer *lexer, tmbchar c ) { if ( lexer->lexsize > 0 ) { lexer->lexbuf[ lexer->lexsize-1 ] = c; } } /* store character c as UTF-8 encoded byte stream */ void TY_(AddCharToLexer)( Lexer *lexer, uint c ) { int i, err, count = 0; tmbchar buf[10] = {0}; err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count ); if (err) { #if 0 && defined(_DEBUG) fprintf( stderr, "lexer UTF-8 encoding error for U+%x : ", c ); #endif /* replacement character 0xFFFD encoded as UTF-8 */ buf[0] = (byte) 0xEF; buf[1] = (byte) 0xBF; buf[2] = (byte) 0xBD; count = 3; } for ( i = 0; i < count; ++i ) AddByte( lexer, buf[i] ); } static void AddStringToLexer( Lexer *lexer, ctmbstr str ) { uint c; /* Many (all?) compilers will sign-extend signed chars (the default) when ** converting them to unsigned integer values. We must cast our char to ** unsigned char before assigning it to prevent this from happening. */ while( 0 != (c = (unsigned char) *str++ )) TY_(AddCharToLexer)( lexer, c ); } static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer ) { lexer->lines = doc->docIn->curline; lexer->columns = doc->docIn->curcol; } /* Have detected the first of a surrogate pair... Try to find, decode the second... Already have '&' start... 
*/ static Bool GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch) { Lexer* lexer = doc->lexer; uint bufSize = 32; uint c, i, ch, offset = 0; tmbstr buf = 0; Bool success = no; /* assume failed */ int type = 0; /* assume numeric */ uint fch = *pch; if (!lexer) return no; buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize); if (!buf) return no; while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream ) { if (c == ';') { break; /* reached end of entity */ } if ((offset + 2) > bufSize) { bufSize *= 2; buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize); if (!buf) { break; } } buf[offset++] = c; /* add char to buffer */ if (offset == 1) { if (c != '#') /* is a numeric entity */ break; } else if (offset == 2 && ((c == 'x') || (!isXml && c == 'X'))) { type = 1; /* set hex digits */ } else { if (type) /* if hex digits */ { if (!IsDigitHex(c)) break; } else /* if numeric */ { if (!TY_(IsDigit)(c)) break; } } } if (c == ';') { buf[offset] = 0; if (type) sscanf(buf + 2, "%x", &ch); else sscanf(buf + 1, "%d", &ch); if (TY_(IsHighSurrogate)(ch)) { ch = TY_(CombineSurrogatePair)(ch, fch); if (TY_(IsValidCombinedChar)(ch)) { *pch = ch; /* return combined pair value */ success = yes; } } } if (!success) { if (ch == ';') TY_(UngetChar)(ch, doc->docIn); if (buf) { for (i = 0; i < offset; i++) { c = buf[i]; TY_(UngetChar)(c, doc->docIn); } } } if (buf) TidyFree(lexer->allocator, buf); return success; } /* No longer attempts to insert missing ';' for unknown enitities unless one was present already, since this gives unexpected results. For example: was tidied to: rather than: My thanks for Maurice Buxton for spotting this. Also Randy Waki pointed out the following case for the 04 Aug 00 version (bug #433012): For example: was tidied to: rather than: where "lang" is a known entity (#9001), but browsers would misinterpret "⟨" because it had a value > 256. So the case of an apparently known entity with a value > 256 and missing a semicolon is handled specially. 
"ParseEntity" is also a bit of a misnomer - it handles entities and numeric character references. Invalid NCR's are now reported. */ static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode ) { typedef enum { ENT_default, ENT_numdec, ENT_numhex } ENTState; typedef Bool (*ENTfn)(uint); const ENTfn entFn[] = { TY_(IsNamechar), TY_(IsDigit), IsDigitHex }; uint start; ENTState entState = ENT_default; uint charRead = 0; Bool semicolon = no, found = no; Bool isXml = cfgBool( doc, TidyXmlTags ); Bool preserveEntities = cfgBool( doc, TidyPreserveEntities ); uint c, ch, startcol, entver = 0; Lexer* lexer = doc->lexer; start = lexer->lexsize - 1; /* to start at "&" */ startcol = doc->docIn->curcol - 1; while ( (c = TY_(ReadChar)(doc->docIn)) != EndOfStream ) { if ( c == ';' ) { semicolon = yes; break; } ++charRead; if (charRead == 1 && c == '#') { #if SUPPORT_ASIAN_ENCODINGS if ( !cfgBool(doc, TidyNCR) || cfg(doc, TidyInCharEncoding) == BIG5 || cfg(doc, TidyInCharEncoding) == SHIFTJIS ) { TY_(UngetChar)('#', doc->docIn); return; } #endif TY_(AddCharToLexer)( lexer, c ); entState = ENT_numdec; continue; } else if (charRead == 2 && entState == ENT_numdec && (c == 'x' || (!isXml && c == 'X')) ) { TY_(AddCharToLexer)( lexer, c ); entState = ENT_numhex; continue; } if ( entFn[entState](c) ) { TY_(AddCharToLexer)( lexer, c ); continue; } /* otherwise put it back */ TY_(UngetChar)( c, doc->docIn ); break; } /* make sure entity is NULL terminated */ lexer->lexbuf[lexer->lexsize] = '\0'; /* Should contrain version to XML/XHTML if ' ** is encountered. But this is not possible with ** Tidy's content model bit mask. 
*/ if ( TY_(tmbstrcmp)(lexer->lexbuf+start, "&apos") == 0 && !cfgBool(doc, TidyXmlOut) && !lexer->isvoyager && !cfgBool(doc, TidyXhtmlOut) && !(TY_(HTMLVersion)(doc) == HT50) ) /* Issue #239 - no warning if in HTML5++ mode */ TY_(ReportEntityError)( doc, APOS_UNDEFINED, lexer->lexbuf+start, 39 ); if (( mode == OtherNamespace ) && ( c == ';' )) { /* #130 MathML attr and entity fix! */ found = yes; ch = 255; entver = XH50|HT50; preserveEntities = yes; } else { /* Lookup entity code and version */ found = TY_(EntityInfo)( lexer->lexbuf+start, isXml, &ch, &entver ); } /* Issue #483 - Deal with 'surrogate pairs' */ /* TODO: Maybe warning/error, like found a leading surrogate but no following surrogate! Maybe should avoid outputting invalid utf-8 for this entity - maybe substitute? */ if (!preserveEntities && found && TY_(IsLowSurrogate)(ch)) { uint c1; if ((c1 = TY_(ReadChar)(doc->docIn)) == '&') { /* have a following entity */ if (!GetSurrogatePair(doc, isXml, &ch)) { TY_(UngetChar)(c1, doc->docIn); /* otherwise put it back */ } } else { /* otherwise put it back */ TY_(UngetChar)(c1, doc->docIn); } } /* deal with unrecognized or invalid entities */ /* #433012 - fix by Randy Waki 17 Feb 01 */ /* report invalid NCR's - Terry Teague 01 Sep 01 */ if ( !found || (ch >= 128 && ch <= 159) || (ch >= 256 && c != ';') ) { /* set error position just before offending character */ SetLexerLocus( doc, lexer ); lexer->columns = startcol; if (lexer->lexsize > start + 1) { if (ch >= 128 && ch <= 159) { /* invalid numeric character reference */ uint c1 = 0; int replaceMode = DISCARDED_CHAR; /* Always assume Win1252 in this circumstance. 
*/ c1 = TY_(DecodeWin1252)( ch ); if ( c1 ) replaceMode = REPLACED_CHAR; if ( c != ';' ) /* issue warning if not terminated by ';' */ TY_(ReportEntityError)( doc, MISSING_SEMICOLON_NCR, lexer->lexbuf+start, c ); TY_(ReportEncodingError)(doc, INVALID_NCR, ch, replaceMode == DISCARDED_CHAR); if ( c1 ) { /* make the replacement */ lexer->lexsize = start; TY_(AddCharToLexer)( lexer, c1 ); semicolon = no; } else { /* discard */ lexer->lexsize = start; semicolon = no; } } else TY_(ReportEntityError)( doc, UNKNOWN_ENTITY, lexer->lexbuf+start, ch ); if (semicolon) TY_(AddCharToLexer)( lexer, ';' ); } else { /*\ * Issue #207 - A naked & is allowed in HTML5, as an unambiguous ampersand! \*/ if (TY_(HTMLVersion)(doc) != HT50) { TY_(ReportEntityError)( doc, UNESCAPED_AMPERSAND, lexer->lexbuf+start, ch ); } } } else { if ( c != ';' ) /* issue warning if not terminated by ';' */ { /* set error position just before offending chararcter */ SetLexerLocus( doc, lexer ); lexer->columns = startcol; TY_(ReportEntityError)( doc, MISSING_SEMICOLON, lexer->lexbuf+start, c ); } if (preserveEntities) TY_(AddCharToLexer)( lexer, ';' ); else { lexer->lexsize = start; if ( ch == 160 && (mode == Preformatted) ) ch = ' '; TY_(AddCharToLexer)( lexer, ch ); if ( ch == '&' && !cfgBool(doc, TidyQuoteAmpersand) ) AddStringToLexer( lexer, "amp;" ); } /* Detect extended vs. 
basic entities */ TY_(ConstrainVersion)( doc, entver ); } } static tmbchar ParseTagName( TidyDocImpl* doc ) { Lexer *lexer = doc->lexer; uint c = lexer->lexbuf[ lexer->txtstart ]; Bool xml = cfgBool(doc, TidyXmlTags); /* fold case of first character in buffer */ if (!xml && TY_(IsUpper)(c)) lexer->lexbuf[lexer->txtstart] = (tmbchar) TY_(ToLower)(c); while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream) { if ((!xml && !TY_(IsNamechar)(c)) || (xml && !TY_(IsXMLNamechar)(c))) break; /* fold case of subsequent characters */ if (!xml && TY_(IsUpper)(c)) c = TY_(ToLower)(c); TY_(AddCharToLexer)(lexer, c); } lexer->txtend = lexer->lexsize; return (tmbchar) c; } /* Used for elements and text nodes element name is NULL for text nodes start and end are offsets into lexbuf which contains the textual content of all elements in the parse tree. parent and content allow traversal of the parse tree in any direction. attributes are represented as a linked list of AttVal nodes which hold the strings for attribute/value pairs. */ Node *TY_(NewNode)(TidyAllocator* allocator, Lexer *lexer) { Node* node = (Node*) TidyAlloc( allocator, sizeof(Node) ); TidyClearMemory( node, sizeof(Node) ); if ( lexer ) { node->line = lexer->lines; node->column = lexer->columns; } node->type = TextNode; #if !defined(NDEBUG) && defined(_MSC_VER) && defined(DEBUG_ALLOCATION) SPRTF("Allocated node %p\n", node ); #endif return node; } /* used to clone heading nodes when split by an
*/ Node *TY_(CloneNode)( TidyDocImpl* doc, Node *element ) { Lexer* lexer = doc->lexer; Node *node = TY_(NewNode)( lexer->allocator, lexer ); node->start = lexer->lexsize; node->end = lexer->lexsize; if ( element ) { node->parent = element->parent; node->type = element->type; node->closed = element->closed; node->implicit = element->implicit; node->tag = element->tag; node->element = TY_(tmbstrdup)( doc->allocator, element->element ); node->attributes = TY_(DupAttrs)( doc, element->attributes ); } return node; } /* free node's attributes */ void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node ) { while ( node->attributes ) { AttVal *av = node->attributes; if ( av->attribute ) { if ( (attrIsID(av) || attrIsNAME(av)) && TY_(IsAnchorElement)(doc, node) ) { TY_(RemoveAnchorByNode)( doc, av->value, node ); } } node->attributes = av->next; TY_(FreeAttribute)( doc, av ); } } /* doesn't repair attribute list linkage */ void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av ) { TY_(FreeNode)( doc, av->asp ); TY_(FreeNode)( doc, av->php ); TidyDocFree( doc, av->attribute ); TidyDocFree( doc, av->value ); TidyDocFree( doc, av ); } /* detach attribute from node */ void TY_(DetachAttribute)( Node *node, AttVal *attr ) { AttVal *av, *prev = NULL; for ( av = node->attributes; av; av = av->next ) { if ( av == attr ) { if ( prev ) prev->next = attr->next; else node->attributes = attr->next; break; } prev = av; } } /* detach attribute from node then free it */ void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr ) { TY_(DetachAttribute)( node, attr ); TY_(FreeAttribute)( doc, attr ); } /* Free document nodes by iterating through peers and recursing through children. Set next to NULL before calling TY_(FreeNode)() to avoid freeing peer nodes. Doesn't patch up prev/next links. 
*/ void TY_(FreeNode)( TidyDocImpl* doc, Node *node ) { #if !defined(NDEBUG) && defined(_MSC_VER) && defined(DEBUG_ALLOCATION) if (node) SPRTF("Free node %p\n", node ); #endif /* this is no good ;=(( if (node && doc && doc->lexer) { if (node == doc->lexer->token) { doc->lexer->token = NULL; // TY_(NewNode)( doc->lexer->allocator, doc->lexer ); } } ----------------- */ while ( node ) { Node* next = node->next; TY_(FreeAttrs)( doc, node ); TY_(FreeNode)( doc, node->content ); TidyDocFree( doc, node->element ); #ifdef TIDY_STORE_ORIGINAL_TEXT if (node->otext) TidyDocFree(doc, node->otext); #endif if (RootNode != node->type) TidyDocFree( doc, node ); else node->content = NULL; node = next; } } #ifdef TIDY_STORE_ORIGINAL_TEXT void StoreOriginalTextInToken(TidyDocImpl* doc, Node* node, uint count) { if (!doc->storeText) return; if (count >= doc->docIn->otextlen) return; if (!doc->docIn->otextsize) return; if (count == 0) { node->otext = doc->docIn->otextbuf; doc->docIn->otextbuf = NULL; doc->docIn->otextlen = 0; doc->docIn->otextsize = 0; } else { uint len = doc->docIn->otextlen; tmbstr buf1 = (tmbstr)TidyDocAlloc(doc, len - count + 1); tmbstr buf2 = (tmbstr)TidyDocAlloc(doc, count + 1); uint i, j; /* strncpy? 
*/ for (i = 0; i < len - count; ++i) buf1[i] = doc->docIn->otextbuf[i]; buf1[i] = 0; for (j = 0; j + i < len; ++j) buf2[j] = doc->docIn->otextbuf[j + i]; buf2[j] = 0; TidyDocFree(doc, doc->docIn->otextbuf); node->otext = buf1; doc->docIn->otextbuf = buf2; doc->docIn->otextlen = count; doc->docIn->otextsize = count + 1; } } #endif Node* TY_(TextToken)( Lexer *lexer ) { Node *node = TY_(NewNode)( lexer->allocator, lexer ); node->start = lexer->txtstart; node->end = lexer->txtend; return node; } /* used for creating preformatted text from Word2000 */ Node *TY_(NewLineNode)( Lexer *lexer ) { Node *node = TY_(NewNode)( lexer->allocator, lexer ); node->start = lexer->lexsize; TY_(AddCharToLexer)( lexer, (uint)'\n' ); node->end = lexer->lexsize; return node; } /* used for adding a   for Word2000 */ Node* TY_(NewLiteralTextNode)( Lexer *lexer, ctmbstr txt ) { Node *node = TY_(NewNode)( lexer->allocator, lexer ); node->start = lexer->lexsize; AddStringToLexer( lexer, txt ); node->end = lexer->lexsize; return node; } static Node* TagToken( TidyDocImpl* doc, NodeType type ) { Lexer* lexer = doc->lexer; Node* node = TY_(NewNode)( lexer->allocator, lexer ); node->type = type; node->element = TY_(tmbstrndup)( doc->allocator, lexer->lexbuf + lexer->txtstart, lexer->txtend - lexer->txtstart ); node->start = lexer->txtstart; node->end = lexer->txtstart; if ( type == StartTag || type == StartEndTag || type == EndTag ) TY_(FindTag)(doc, node); return node; } static Node* NewToken(TidyDocImpl* doc, NodeType type) { Lexer* lexer = doc->lexer; Node* node = TY_(NewNode)(lexer->allocator, lexer); node->type = type; node->start = lexer->txtstart; node->end = lexer->txtend; #ifdef TIDY_STORE_ORIGINAL_TEXT StoreOriginalTextInToken(doc, node, 0); #endif return node; } #define CommentToken(doc) NewToken(doc, CommentTag) #define DocTypeToken(doc) NewToken(doc, DocTypeTag) #define PIToken(doc) NewToken(doc, ProcInsTag) #define AspToken(doc) NewToken(doc, AspTag) #define JsteToken(doc) 
NewToken(doc, JsteTag) #define PhpToken(doc) NewToken(doc, PhpTag) #define XmlDeclToken(doc) NewToken(doc, XmlDecl) #define SectionToken(doc) NewToken(doc, SectionTag) #define CDATAToken(doc) NewToken(doc, CDATATag) void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str ) { byte c; while(0 != (c = *str++) ) { /*\ * Issue #286 * Previously this used TY_(AddCharToLexer)( lexer, c ); * which uses err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count ); * But this is transferring already 'translated' data from an * internal location to the lexer, so should use AddByte() \*/ AddByte( lexer, c ); } } /* void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ) { byte c; int ix; for ( ix=0; ix < len && (c = *str++); ++ix ) TY_(AddCharToLexer)(lexer, c); } */ /* find doctype element */ Node *TY_(FindDocType)( TidyDocImpl* doc ) { Node* node; for ( node = (doc ? doc->root.content : NULL); node && node->type != DocTypeTag; node = node->next ) /**/; return node; } /* find parent container element */ Node* TY_(FindContainer)( Node* node ) { for ( node = (node ? node->parent : NULL); node && TY_(nodeHasCM)(node, CM_INLINE); node = node->parent ) /**/; return node; } /* find html element */ Node *TY_(FindHTML)( TidyDocImpl* doc ) { Node *node; for ( node = (doc ? doc->root.content : NULL); node && !nodeIsHTML(node); node = node->next ) /**/; return node; } /* find XML Declaration */ Node *TY_(FindXmlDecl)(TidyDocImpl* doc) { Node *node; for ( node = (doc ? 
doc->root.content : NULL); node && !(node->type == XmlDecl); node = node->next ) /**/; return node; } Node *TY_(FindHEAD)( TidyDocImpl* doc ) { Node *node = TY_(FindHTML)( doc ); if ( node ) { for ( node = node->content; node && !nodeIsHEAD(node); node = node->next ) /**/; } return node; } Node *TY_(FindTITLE)(TidyDocImpl* doc) { Node *node = TY_(FindHEAD)(doc); if (node) for (node = node->content; node && !nodeIsTITLE(node); node = node->next) {} return node; } Node *TY_(FindBody)( TidyDocImpl* doc ) { Node *node = ( doc ? doc->root.content : NULL ); while ( node && !nodeIsHTML(node) ) node = node->next; if (node == NULL) return NULL; node = node->content; while ( node && !nodeIsBODY(node) && !nodeIsFRAMESET(node) ) node = node->next; if ( node && nodeIsFRAMESET(node) ) { node = node->content; while ( node && !nodeIsNOFRAMES(node) ) node = node->next; if ( node ) { node = node->content; while ( node && !nodeIsBODY(node) ) node = node->next; } } return node; } /* add meta element for Tidy */ Bool TY_(AddGenerator)( TidyDocImpl* doc ) { AttVal *attval; Node *node; Node *head = TY_(FindHEAD)( doc ); tmbchar buf[256]; if (head) { #ifdef PLATFORM_NAME TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for HTML5 for "PLATFORM_NAME" version %s", tidyLibraryVersion()); #else TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for HTML5 version %s", tidyLibraryVersion()); #endif for ( node = head->content; node; node = node->next ) { if ( nodeIsMETA(node) ) { attval = TY_(AttrGetById)(node, TidyAttr_NAME); if (AttrValueIs(attval, "generator")) { attval = TY_(AttrGetById)(node, TidyAttr_CONTENT); if (AttrHasValue(attval) && TY_(tmbstrncasecmp)(attval->value, "HTML Tidy", 9) == 0) { /* update the existing content to reflect the */ /* actual version of Tidy currently being used */ TidyDocFree(doc, attval->value); attval->value = TY_(tmbstrdup)(doc->allocator, buf); return no; } } } } if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 ) { node = TY_(InferredTag)(doc, TidyTag_META); 
TY_(AddAttribute)( doc, node, "name", "generator" ); TY_(AddAttribute)( doc, node, "content", buf ); TY_(InsertNodeAtStart)( head, node ); return yes; } } return no; } /*\ examine to identify version * Issue #167 and #169 * If HTML5 * * * else others \*/ static uint FindGivenVersion( TidyDocImpl* doc, Node* doctype ) { AttVal * fpi = TY_(GetAttrByName)(doctype, "PUBLIC"); uint vers; if (!fpi || !fpi->value) { if (doctype->element && (TY_(tmbstrcmp)(doctype->element,"html") == 0)) { return VERS_HTML5; /* TODO: do we need to check MORE? */ } /* TODO: Consider warning, error message */ return VERS_UNKNOWN; } vers = GetVersFromFPI(fpi->value); if (VERS_XHTML & vers) { TY_(SetOptionBool)(doc, TidyXmlOut, yes); TY_(SetOptionBool)(doc, TidyXhtmlOut, yes); doc->lexer->isvoyager = yes; } /* todo: add a warning if case does not match? */ TidyDocFree(doc, fpi->value); fpi->value = TY_(tmbstrdup)(doc->allocator, GetFPIFromVers(vers)); return vers; } /* return guessed version */ uint TY_(ApparentVersion)( TidyDocImpl* doc ) { if ((doc->lexer->doctype == XH11 || doc->lexer->doctype == XB10) && (doc->lexer->versions & doc->lexer->doctype)) return doc->lexer->doctype; else return TY_(HTMLVersion)(doc); } ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool ARG_UNUSED(isXhtml) ) { ctmbstr name = GetNameFromVers(vers); /* this test has moved to ReportMarkupVersion() in localize.c, for localization reasons */ /* if (!name) name = "HTML Proprietary"; */ return name; } Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc ) { Bool isXhtml = doc->lexer->isvoyager; Node* doctype; /* Do not warn in XHTML mode */ if ( isXhtml ) return no; /* Do not warn if emitted doctype is proprietary */ if ( TY_(HTMLVersionNameFromCode)(doc->lexer->versionEmitted, isXhtml ) == NULL ) return no; /* Do not warn if no SI is possible */ if ( GetSIFromVers(doc->lexer->versionEmitted) == NULL ) return no; if ( (doctype = TY_(FindDocType)( doc )) != NULL && TY_(GetAttrByName)(doctype, "SYSTEM") == NULL ) 
return yes; return no; } /* Put DOCTYPE declaration between the ** declaration, if any, ** and the tag. Should also work for any comments, ** etc. that may precede the tag. */ static Node* NewDocTypeNode( TidyDocImpl* doc ) { Node* doctype = NULL; Node* html = TY_(FindHTML)( doc ); if ( !html ) return NULL; doctype = TY_(NewNode)( doc->allocator, NULL ); doctype->type = DocTypeTag; TY_(InsertNodeBeforeElement)(html, doctype); return doctype; } Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc ) { Lexer *lexer = doc->lexer; Node *doctype = TY_(FindDocType)( doc ); TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode); ctmbstr pub = "PUBLIC"; ctmbstr sys = "SYSTEM"; lexer->versionEmitted = TY_(ApparentVersion)( doc ); if (dtmode == TidyDoctypeOmit) { if (doctype) TY_(DiscardElement)(doc, doctype); return yes; } if (dtmode == TidyDoctypeUser && !cfgStr(doc, TidyDoctype)) return no; if (!doctype) { doctype = NewDocTypeNode(doc); doctype->element = TY_(tmbstrdup)(doc->allocator, "html"); } else { doctype->element = TY_(tmbstrtolower)(doctype->element); } switch(dtmode) { case TidyDoctypeHtml5: /* HTML5 */ TY_(RepairAttrValue)(doc, doctype, pub, NULL); TY_(RepairAttrValue)(doc, doctype, sys, NULL); lexer->versionEmitted = XH50; break; case TidyDoctypeStrict: /* XHTML 1.0 Strict */ TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S)); TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S)); lexer->versionEmitted = X10S; break; case TidyDoctypeLoose: /* XHTML 1.0 Transitional */ TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T)); TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T)); lexer->versionEmitted = X10T; break; case TidyDoctypeUser: /* user defined document type declaration */ TY_(RepairAttrValue)(doc, doctype, pub, cfgStr(doc, TidyDoctype)); TY_(RepairAttrValue)(doc, doctype, sys, ""); break; case TidyDoctypeAuto: if (lexer->doctype == VERS_UNKNOWN || lexer->doctype == VERS_HTML5) { lexer->versionEmitted = XH50; return 
yes; } else if (lexer->versions & XH11 && lexer->doctype == XH11) { if (!TY_(GetAttrByName)(doctype, sys)) TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11)); lexer->versionEmitted = XH11; return yes; } else if (lexer->versions & XH11 && !(lexer->versions & VERS_HTML40)) { TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(XH11)); TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11)); lexer->versionEmitted = XH11; } else if (lexer->versions & XB10 && lexer->doctype == XB10) { if (!TY_(GetAttrByName)(doctype, sys)) TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XB10)); lexer->versionEmitted = XB10; return yes; } else if (lexer->versions & VERS_HTML40_STRICT) { TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S)); TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S)); lexer->versionEmitted = X10S; } else if (lexer->versions & VERS_FRAMESET) { TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10F)); TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10F)); lexer->versionEmitted = X10F; } else if (lexer->versions & VERS_LOOSE) { TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T)); TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T)); lexer->versionEmitted = X10T; } else if (lexer->versions & VERS_HTML5) { /*\ * Issue #273 - If still a html5/xhtml5 bit * existing, that is the 'ConstrainVersion' has * not eliminated all HTML5, then nothing to do here. * Certainly do **not** delete the DocType node! 
* see: http://www.w3.org/QA/Tips/Doctype \*/ } else { if (doctype) TY_(DiscardElement)(doc, doctype); return no; } break; case TidyDoctypeOmit: assert(0); break; } return no; } /* fixup doctype if missing */ Bool TY_(FixDocType)( TidyDocImpl* doc ) { Lexer* lexer = doc->lexer; Node* doctype = TY_(FindDocType)( doc ); uint dtmode = cfg( doc, TidyDoctypeMode ); uint guessed = VERS_UNKNOWN; Bool hadSI = no; /* Issue #167 - found doctype, and doctype is default VERS_HTML5, set VERS_HTML5 and return yes */ if (doctype && (dtmode == TidyDoctypeAuto) && (lexer->doctype == VERS_HTML5) ) { /* The version emitted cannot be a composite value! */ lexer->versionEmitted = HT50; return yes; } if (dtmode == TidyDoctypeAuto && lexer->versions & lexer->doctype && !(VERS_XHTML & lexer->doctype && !lexer->isvoyager) && TY_(FindDocType)(doc)) { lexer->versionEmitted = lexer->doctype; return yes; } if (dtmode == TidyDoctypeOmit) { if (doctype) TY_(DiscardElement)( doc, doctype ); lexer->versionEmitted = TY_(ApparentVersion)( doc ); return yes; } if (cfgBool(doc, TidyXmlOut)) return yes; if (doctype) hadSI = TY_(GetAttrByName)(doctype, "SYSTEM") != NULL; if ((dtmode == TidyDoctypeStrict || dtmode == TidyDoctypeLoose) && doctype) { TY_(DiscardElement)(doc, doctype); doctype = NULL; } switch (dtmode) { case TidyDoctypeHtml5: guessed = HT50; break; case TidyDoctypeStrict: guessed = H41S; break; case TidyDoctypeLoose: guessed = H41T; break; case TidyDoctypeAuto: guessed = TY_(HTMLVersion)(doc); break; } lexer->versionEmitted = guessed; if (guessed == VERS_UNKNOWN) return no; if (doctype) { doctype->element = TY_(tmbstrtolower)(doctype->element); } else { doctype = NewDocTypeNode(doc); doctype->element = TY_(tmbstrdup)(doc->allocator, "html"); } TY_(RepairAttrValue)(doc, doctype, "PUBLIC", GetFPIFromVers(guessed)); if (hadSI) TY_(RepairAttrValue)(doc, doctype, "SYSTEM", GetSIFromVers(guessed)); return yes; } /* ensure XML document starts with */ /* add encoding attribute if not using ASCII or 
UTF-8 output */ Bool TY_(FixXmlDecl)( TidyDocImpl* doc ) { Node* xml; AttVal *version, *encoding; Lexer*lexer = doc->lexer; Node* root = &doc->root; if ( root->content && root->content->type == XmlDecl ) { xml = root->content; } else { xml = TY_(NewNode)(lexer->allocator, lexer); xml->type = XmlDecl; if ( root->content ) TY_(InsertNodeBeforeElement)(root->content, xml); else root->content = xml; } version = TY_(GetAttrByName)(xml, "version"); encoding = TY_(GetAttrByName)(xml, "encoding"); /* We need to insert a check if declared encoding and output encoding mismatch and fix the XML declaration accordingly!!! */ if ( encoding == NULL && cfg(doc, TidyOutCharEncoding) != UTF8 ) { ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding)); if ( enc ) TY_(AddAttribute)( doc, xml, "encoding", enc ); } if ( version == NULL ) TY_(AddAttribute)( doc, xml, "version", "1.0" ); return yes; } Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id) { Lexer *lexer = doc->lexer; Node *node = TY_(NewNode)( lexer->allocator, lexer ); const Dict* dict = TY_(LookupTagDef)(id); assert( dict != NULL ); node->type = StartTag; node->implicit = yes; node->element = TY_(tmbstrdup)(doc->allocator, dict->name); node->tag = dict; node->start = lexer->txtstart; node->end = lexer->txtend; return node; } static Bool ExpectsContent(Node *node) { if (node->type != StartTag) return no; /* unknown element? */ if (node->tag == NULL) return yes; if (node->tag->model & CM_EMPTY) return no; return yes; } /* create a text node for the contents of a CDATA element like style or script which ends with for some foo. */ typedef enum { CDATA_INTERMEDIATE, CDATA_STARTTAG, CDATA_ENDTAG } CDATAState; static Node *GetCDATA( TidyDocImpl* doc, Node *container ) { Lexer* lexer = doc->lexer; uint start = 0; int nested = 0; CDATAState state = CDATA_INTERMEDIATE; uint i; Bool isEmpty = yes; Bool matches = no; uint c; Bool hasSrc = (TY_(AttrGetById)(container, TidyAttr_SRC) != NULL) ? 
yes : no; /*\ Issue #65 (1642186) and #280 - is script or style, and the option on * If yes, then avoid incrementing nested... \*/ Bool nonested = ((nodeIsSCRIPT(container) || (nodeIsSTYLE(container))) && cfgBool(doc, TidySkipNested)) ? yes : no; SetLexerLocus( doc, lexer ); lexer->waswhite = no; lexer->txtstart = lexer->txtend = lexer->lexsize; /* seen start tag, look for matching end tag */ while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream) { TY_(AddCharToLexer)(lexer, c); lexer->txtend = lexer->lexsize; if (state == CDATA_INTERMEDIATE) { if (c != '<') { if (isEmpty && !TY_(IsWhite)(c)) isEmpty = no; continue; } c = TY_(ReadChar)(doc->docIn); if (TY_(IsLetter)(c)) { /*