9cf97d536b
This bug was first opened in 2009 by Christophe Chenon as bug sf905, but the patch provided then never made it into the source. It now appears fixed, 7 years later!
425 lines
14 KiB
C
425 lines
14 KiB
C
/* entities.c -- recognize HTML ISO entities
|
|
|
|
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
|
|
See tidy.h for the copyright notice.
|
|
|
|
Entity handling can be static because there are no config or
|
|
document-specific values. Lookup table is 100% defined at
|
|
compile time.
|
|
|
|
*/
|
|
|
|
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#include "entities.h"
#include "tidy-int.h"
#include "tmbstr.h"
|
|
|
|
struct _entity;
|
|
typedef struct _entity entity;
|
|
|
|
struct _entity
|
|
{
|
|
ctmbstr name;
|
|
uint versions;
|
|
uint code;
|
|
};
|
|
|
|
|
|
static const entity entities[] =
|
|
{
|
|
/*
|
|
** Markup pre-defined character entities
|
|
*/
|
|
{ "quot", VERS_ALL|VERS_XML, 34 },
|
|
{ "amp", VERS_ALL|VERS_XML, 38 },
|
|
{ "apos", VERS_FROM40|VERS_XML, 39 },
|
|
{ "lt", VERS_ALL|VERS_XML, 60 },
|
|
{ "gt", VERS_ALL|VERS_XML, 62 },
|
|
|
|
/*
|
|
** Latin-1 character entities
|
|
*/
|
|
{ "nbsp", VERS_ALL, 160 },
|
|
{ "iexcl", VERS_ALL, 161 },
|
|
{ "cent", VERS_ALL, 162 },
|
|
{ "pound", VERS_ALL, 163 },
|
|
{ "curren", VERS_ALL, 164 },
|
|
{ "yen", VERS_ALL, 165 },
|
|
{ "brvbar", VERS_ALL, 166 },
|
|
{ "sect", VERS_ALL, 167 },
|
|
{ "uml", VERS_ALL, 168 },
|
|
{ "copy", VERS_ALL, 169 },
|
|
{ "ordf", VERS_ALL, 170 },
|
|
{ "laquo", VERS_ALL, 171 },
|
|
{ "not", VERS_ALL, 172 },
|
|
{ "shy", VERS_ALL, 173 },
|
|
{ "reg", VERS_ALL, 174 },
|
|
{ "macr", VERS_ALL, 175 },
|
|
{ "deg", VERS_ALL, 176 },
|
|
{ "plusmn", VERS_ALL, 177 },
|
|
{ "sup2", VERS_ALL, 178 },
|
|
{ "sup3", VERS_ALL, 179 },
|
|
{ "acute", VERS_ALL, 180 },
|
|
{ "micro", VERS_ALL, 181 },
|
|
{ "para", VERS_ALL, 182 },
|
|
{ "middot", VERS_ALL, 183 },
|
|
{ "cedil", VERS_ALL, 184 },
|
|
{ "sup1", VERS_ALL, 185 },
|
|
{ "ordm", VERS_ALL, 186 },
|
|
{ "raquo", VERS_ALL, 187 },
|
|
{ "frac14", VERS_ALL, 188 },
|
|
{ "frac12", VERS_ALL, 189 },
|
|
{ "frac34", VERS_ALL, 190 },
|
|
{ "iquest", VERS_ALL, 191 },
|
|
{ "Agrave", VERS_ALL, 192 },
|
|
{ "Aacute", VERS_ALL, 193 },
|
|
{ "Acirc", VERS_ALL, 194 },
|
|
{ "Atilde", VERS_ALL, 195 },
|
|
{ "Auml", VERS_ALL, 196 },
|
|
{ "Aring", VERS_ALL, 197 },
|
|
{ "AElig", VERS_ALL, 198 },
|
|
{ "Ccedil", VERS_ALL, 199 },
|
|
{ "Egrave", VERS_ALL, 200 },
|
|
{ "Eacute", VERS_ALL, 201 },
|
|
{ "Ecirc", VERS_ALL, 202 },
|
|
{ "Euml", VERS_ALL, 203 },
|
|
{ "Igrave", VERS_ALL, 204 },
|
|
{ "Iacute", VERS_ALL, 205 },
|
|
{ "Icirc", VERS_ALL, 206 },
|
|
{ "Iuml", VERS_ALL, 207 },
|
|
{ "ETH", VERS_ALL, 208 },
|
|
{ "Ntilde", VERS_ALL, 209 },
|
|
{ "Ograve", VERS_ALL, 210 },
|
|
{ "Oacute", VERS_ALL, 211 },
|
|
{ "Ocirc", VERS_ALL, 212 },
|
|
{ "Otilde", VERS_ALL, 213 },
|
|
{ "Ouml", VERS_ALL, 214 },
|
|
{ "times", VERS_ALL, 215 },
|
|
{ "Oslash", VERS_ALL, 216 },
|
|
{ "Ugrave", VERS_ALL, 217 },
|
|
{ "Uacute", VERS_ALL, 218 },
|
|
{ "Ucirc", VERS_ALL, 219 },
|
|
{ "Uuml", VERS_ALL, 220 },
|
|
{ "Yacute", VERS_ALL, 221 },
|
|
{ "THORN", VERS_ALL, 222 },
|
|
{ "szlig", VERS_ALL, 223 },
|
|
{ "agrave", VERS_ALL, 224 },
|
|
{ "aacute", VERS_ALL, 225 },
|
|
{ "acirc", VERS_ALL, 226 },
|
|
{ "atilde", VERS_ALL, 227 },
|
|
{ "auml", VERS_ALL, 228 },
|
|
{ "aring", VERS_ALL, 229 },
|
|
{ "aelig", VERS_ALL, 230 },
|
|
{ "ccedil", VERS_ALL, 231 },
|
|
{ "egrave", VERS_ALL, 232 },
|
|
{ "eacute", VERS_ALL, 233 },
|
|
{ "ecirc", VERS_ALL, 234 },
|
|
{ "euml", VERS_ALL, 235 },
|
|
{ "igrave", VERS_ALL, 236 },
|
|
{ "iacute", VERS_ALL, 237 },
|
|
{ "icirc", VERS_ALL, 238 },
|
|
{ "iuml", VERS_ALL, 239 },
|
|
{ "eth", VERS_ALL, 240 },
|
|
{ "ntilde", VERS_ALL, 241 },
|
|
{ "ograve", VERS_ALL, 242 },
|
|
{ "oacute", VERS_ALL, 243 },
|
|
{ "ocirc", VERS_ALL, 244 },
|
|
{ "otilde", VERS_ALL, 245 },
|
|
{ "ouml", VERS_ALL, 246 },
|
|
{ "divide", VERS_ALL, 247 },
|
|
{ "oslash", VERS_ALL, 248 },
|
|
{ "ugrave", VERS_ALL, 249 },
|
|
{ "uacute", VERS_ALL, 250 },
|
|
{ "ucirc", VERS_ALL, 251 },
|
|
{ "uuml", VERS_ALL, 252 },
|
|
{ "yacute", VERS_ALL, 253 },
|
|
{ "thorn", VERS_ALL, 254 },
|
|
{ "yuml", VERS_ALL, 255 },
|
|
|
|
/*
|
|
** Extended Entities defined in HTML 4: Symbols
|
|
*/
|
|
{ "fnof", VERS_FROM40, 402 },
|
|
{ "Alpha", VERS_FROM40, 913 },
|
|
{ "Beta", VERS_FROM40, 914 },
|
|
{ "Gamma", VERS_FROM40, 915 },
|
|
{ "Delta", VERS_FROM40, 916 },
|
|
{ "Epsilon", VERS_FROM40, 917 },
|
|
{ "Zeta", VERS_FROM40, 918 },
|
|
{ "Eta", VERS_FROM40, 919 },
|
|
{ "Theta", VERS_FROM40, 920 },
|
|
{ "Iota", VERS_FROM40, 921 },
|
|
{ "Kappa", VERS_FROM40, 922 },
|
|
{ "Lambda", VERS_FROM40, 923 },
|
|
{ "Mu", VERS_FROM40, 924 },
|
|
{ "Nu", VERS_FROM40, 925 },
|
|
{ "Xi", VERS_FROM40, 926 },
|
|
{ "Omicron", VERS_FROM40, 927 },
|
|
{ "Pi", VERS_FROM40, 928 },
|
|
{ "Rho", VERS_FROM40, 929 },
|
|
{ "Sigma", VERS_FROM40, 931 },
|
|
{ "Tau", VERS_FROM40, 932 },
|
|
{ "Upsilon", VERS_FROM40, 933 },
|
|
{ "Phi", VERS_FROM40, 934 },
|
|
{ "Chi", VERS_FROM40, 935 },
|
|
{ "Psi", VERS_FROM40, 936 },
|
|
{ "Omega", VERS_FROM40, 937 },
|
|
{ "alpha", VERS_FROM40, 945 },
|
|
{ "beta", VERS_FROM40, 946 },
|
|
{ "gamma", VERS_FROM40, 947 },
|
|
{ "delta", VERS_FROM40, 948 },
|
|
{ "epsilon", VERS_FROM40, 949 },
|
|
{ "zeta", VERS_FROM40, 950 },
|
|
{ "eta", VERS_FROM40, 951 },
|
|
{ "theta", VERS_FROM40, 952 },
|
|
{ "iota", VERS_FROM40, 953 },
|
|
{ "kappa", VERS_FROM40, 954 },
|
|
{ "lambda", VERS_FROM40, 955 },
|
|
{ "mu", VERS_FROM40, 956 },
|
|
{ "nu", VERS_FROM40, 957 },
|
|
{ "xi", VERS_FROM40, 958 },
|
|
{ "omicron", VERS_FROM40, 959 },
|
|
{ "pi", VERS_FROM40, 960 },
|
|
{ "rho", VERS_FROM40, 961 },
|
|
{ "sigmaf", VERS_FROM40, 962 },
|
|
{ "sigma", VERS_FROM40, 963 },
|
|
{ "tau", VERS_FROM40, 964 },
|
|
{ "upsilon", VERS_FROM40, 965 },
|
|
{ "phi", VERS_FROM40, 966 },
|
|
{ "chi", VERS_FROM40, 967 },
|
|
{ "psi", VERS_FROM40, 968 },
|
|
{ "omega", VERS_FROM40, 969 },
|
|
{ "thetasym", VERS_FROM40, 977 },
|
|
{ "upsih", VERS_FROM40, 978 },
|
|
{ "piv", VERS_FROM40, 982 },
|
|
{ "bull", VERS_FROM40, 8226 },
|
|
{ "hellip", VERS_FROM40, 8230 },
|
|
{ "prime", VERS_FROM40, 8242 },
|
|
{ "Prime", VERS_FROM40, 8243 },
|
|
{ "oline", VERS_FROM40, 8254 },
|
|
{ "frasl", VERS_FROM40, 8260 },
|
|
{ "weierp", VERS_FROM40, 8472 },
|
|
{ "image", VERS_FROM40, 8465 },
|
|
{ "real", VERS_FROM40, 8476 },
|
|
{ "trade", VERS_FROM40, 8482 },
|
|
{ "alefsym", VERS_FROM40, 8501 },
|
|
{ "larr", VERS_FROM40, 8592 },
|
|
{ "uarr", VERS_FROM40, 8593 },
|
|
{ "rarr", VERS_FROM40, 8594 },
|
|
{ "darr", VERS_FROM40, 8595 },
|
|
{ "harr", VERS_FROM40, 8596 },
|
|
{ "crarr", VERS_FROM40, 8629 },
|
|
{ "lArr", VERS_FROM40, 8656 },
|
|
{ "uArr", VERS_FROM40, 8657 },
|
|
{ "rArr", VERS_FROM40, 8658 },
|
|
{ "dArr", VERS_FROM40, 8659 },
|
|
{ "hArr", VERS_FROM40, 8660 },
|
|
{ "forall", VERS_FROM40, 8704 },
|
|
{ "part", VERS_FROM40, 8706 },
|
|
{ "exist", VERS_FROM40, 8707 },
|
|
{ "empty", VERS_FROM40, 8709 },
|
|
{ "nabla", VERS_FROM40, 8711 },
|
|
{ "isin", VERS_FROM40, 8712 },
|
|
{ "notin", VERS_FROM40, 8713 },
|
|
{ "ni", VERS_FROM40, 8715 },
|
|
{ "prod", VERS_FROM40, 8719 },
|
|
{ "sum", VERS_FROM40, 8721 },
|
|
{ "minus", VERS_FROM40, 8722 },
|
|
{ "lowast", VERS_FROM40, 8727 },
|
|
{ "radic", VERS_FROM40, 8730 },
|
|
{ "prop", VERS_FROM40, 8733 },
|
|
{ "infin", VERS_FROM40, 8734 },
|
|
{ "ang", VERS_FROM40, 8736 },
|
|
{ "and", VERS_FROM40, 8743 },
|
|
{ "or", VERS_FROM40, 8744 },
|
|
{ "cap", VERS_FROM40, 8745 },
|
|
{ "cup", VERS_FROM40, 8746 },
|
|
{ "int", VERS_FROM40, 8747 },
|
|
{ "there4", VERS_FROM40, 8756 },
|
|
{ "sim", VERS_FROM40, 8764 },
|
|
{ "cong", VERS_FROM40, 8773 },
|
|
{ "asymp", VERS_FROM40, 8776 },
|
|
{ "ne", VERS_FROM40, 8800 },
|
|
{ "equiv", VERS_FROM40, 8801 },
|
|
{ "le", VERS_FROM40, 8804 },
|
|
{ "ge", VERS_FROM40, 8805 },
|
|
{ "sub", VERS_FROM40, 8834 },
|
|
{ "sup", VERS_FROM40, 8835 },
|
|
{ "nsub", VERS_FROM40, 8836 },
|
|
{ "sube", VERS_FROM40, 8838 },
|
|
{ "supe", VERS_FROM40, 8839 },
|
|
{ "oplus", VERS_FROM40, 8853 },
|
|
{ "otimes", VERS_FROM40, 8855 },
|
|
{ "perp", VERS_FROM40, 8869 },
|
|
{ "sdot", VERS_FROM40, 8901 },
|
|
{ "lceil", VERS_FROM40, 8968 },
|
|
{ "rceil", VERS_FROM40, 8969 },
|
|
{ "lfloor", VERS_FROM40, 8970 },
|
|
{ "rfloor", VERS_FROM40, 8971 },
|
|
{ "lang", VERS_FROM40, 10216 },
|
|
{ "rang", VERS_FROM40, 10217 },
|
|
{ "loz", VERS_FROM40, 9674 },
|
|
{ "spades", VERS_FROM40, 9824 },
|
|
{ "clubs", VERS_FROM40, 9827 },
|
|
{ "hearts", VERS_FROM40, 9829 },
|
|
{ "diams", VERS_FROM40, 9830 },
|
|
|
|
/*
|
|
** Extended Entities defined in HTML 4: Special (less Markup at top)
|
|
*/
|
|
{ "OElig", VERS_FROM40, 338 },
|
|
{ "oelig", VERS_FROM40, 339 },
|
|
{ "Scaron", VERS_FROM40, 352 },
|
|
{ "scaron", VERS_FROM40, 353 },
|
|
{ "Yuml", VERS_FROM40, 376 },
|
|
{ "circ", VERS_FROM40, 710 },
|
|
{ "tilde", VERS_FROM40, 732 },
|
|
{ "ensp", VERS_FROM40, 8194 },
|
|
{ "emsp", VERS_FROM40, 8195 },
|
|
{ "thinsp", VERS_FROM40, 8201 },
|
|
{ "zwnj", VERS_FROM40, 8204 },
|
|
{ "zwj", VERS_FROM40, 8205 },
|
|
{ "lrm", VERS_FROM40, 8206 },
|
|
{ "rlm", VERS_FROM40, 8207 },
|
|
{ "ndash", VERS_FROM40, 8211 },
|
|
{ "mdash", VERS_FROM40, 8212 },
|
|
{ "lsquo", VERS_FROM40, 8216 },
|
|
{ "rsquo", VERS_FROM40, 8217 },
|
|
{ "sbquo", VERS_FROM40, 8218 },
|
|
{ "ldquo", VERS_FROM40, 8220 },
|
|
{ "rdquo", VERS_FROM40, 8221 },
|
|
{ "bdquo", VERS_FROM40, 8222 },
|
|
{ "dagger", VERS_FROM40, 8224 },
|
|
{ "Dagger", VERS_FROM40, 8225 },
|
|
{ "permil", VERS_FROM40, 8240 },
|
|
{ "lsaquo", VERS_FROM40, 8249 },
|
|
{ "rsaquo", VERS_FROM40, 8250 },
|
|
{ "euro", VERS_FROM40, 8364 },
|
|
{ NULL, VERS_UNKNOWN, 0 }
|
|
};
|
|
|
|
|
|
/* Pure static implementation. Trades off lookup speed
|
|
** for faster setup time (well, none actually).
|
|
** Optimization of comparing 1st character buys enough
|
|
** speed that hash doesn't improve things without > 500
|
|
** items in list.
|
|
*/
|
|
static const entity* entitiesLookup( ctmbstr s )
|
|
{
|
|
tmbchar ch = (tmbchar)( s ? *s : 0 );
|
|
const entity *np;
|
|
for ( np = entities; ch && np && np->name; ++np )
|
|
if ( ch == *np->name && TY_(tmbstrcmp)(s, np->name) == 0 )
|
|
return np;
|
|
return NULL;
|
|
}
|
|
|
|
#if 0
/* Disabled code, excluded from compilation by the enclosing #if 0.
**
** Maps an entity reference (text starting with "&") to its character
** code.  Returns zero for an unknown name, a name not defined in any
** of the requested versions, or a missing/bad number.
*/
uint EntityCode( ctmbstr name, uint versions )
{
    const entity* match;

    assert( name && name[0] == '&' );

    /* Named entity: name = "&" followed by a name */
    if ( name[1] != '#' )
    {
        match = entitiesLookup( name+1 );

        /* Only recognize entity name if version supports it. */
        if ( match != NULL && (match->versions & versions) )
            return match->code;

        return 0;  /* zero signifies unknown entity name */
    }

    /* Numeric entity: name = "&#" followed by number */
    {
        uint value = 0;  /* stays zero on missing/bad number */
        Bool isXml = ( (versions & VERS_XML) == VERS_XML );

        /* 'x' prefix denotes hexadecimal number format */
        if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
            sscanf( name+3, "%x", &value );
        else
            sscanf( name+2, "%u", &value );

        return (uint) value;
    }
}
#endif
|
|
|
|
Bool TY_(EntityInfo)( ctmbstr name, Bool isXml, uint* code, uint* versions )
|
|
{
|
|
const entity* np;
|
|
int res;
|
|
assert( name && name[0] == '&' );
|
|
assert( code != NULL );
|
|
assert( versions != NULL );
|
|
|
|
/* numeric entitity: name = "&#" followed by number */
|
|
if ( name[1] == '#' )
|
|
{
|
|
uint c = 0; /* zero on missing/bad number */
|
|
|
|
/* 'x' prefix denotes hexadecimal number format */
|
|
if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
|
|
res = sscanf( name+3, "%x", &c );
|
|
else
|
|
res = sscanf( name+2, "%u", &c );
|
|
|
|
/* Issue #373 - Null Char in XML result doc - sf905 2009 */
|
|
if ( res == 1 )
|
|
{
|
|
*code = c;
|
|
*versions = VERS_ALL;
|
|
return yes;
|
|
}
|
|
else
|
|
{
|
|
*code = 0;
|
|
*versions = ( isXml ? VERS_XML : VERS_PROPRIETARY );
|
|
return no;
|
|
}
|
|
}
|
|
|
|
/* Named entity: name ="&" followed by a name */
|
|
if ( NULL != (np = entitiesLookup(name+1)) )
|
|
{
|
|
*code = np->code;
|
|
*versions = np->versions;
|
|
return yes;
|
|
}
|
|
|
|
*code = 0;
|
|
*versions = ( isXml ? VERS_XML : VERS_PROPRIETARY );
|
|
return no;
|
|
}
|
|
|
|
|
|
/* Reverse lookup: returns the entity name for character code ch.
**
** Only the first table row carrying that code is considered.  Its
** name is returned if its version mask shares a bit with versions;
** otherwise, or when no row has that code, the result is NULL.
*/
ctmbstr TY_(EntityName)( uint ch, uint versions )
{
    const entity *row = entities;

    /* advance to the first row for this code, or to the sentinel */
    while ( row->name != NULL && row->code != ch )
        ++row;

    if ( row->name == NULL )
        return NULL;  /* code not in table */

    /* Found code.  The search stops here even on a version mismatch. */
    if ( row->versions & versions )
        return row->name;

    return NULL;
}
|
|
|
|
/*
|
|
* local variables:
|
|
* mode: c
|
|
* indent-tabs-mode: nil
|
|
* c-basic-offset: 4
|
|
* eval: (c-set-offset 'substatement-open 0)
|
|
* end:
|
|
*/
|