/* entities.c -- recognize HTML ISO entities

  (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
  See tidy.h for the copyright notice.

  Entity handling can be static because there are no config or
  document-specific values.  The lookup table is 100% defined at
  compile time.
*/
#include <stdio.h>
#include "entities.h"
#include "tidy-int.h"
#include "tmbstr.h"
/* One row of the entity table: an entity name paired with the markup
** versions that define it and the character code it stands for.
*/
typedef struct _entity
{
    ctmbstr name;      /* entity name, stored without the leading '&' */
    uint    versions;  /* VERS_* bit mask of versions defining the name */
    uint    code;      /* character code the entity expands to */
} entity;
/* Table of every named entity known to this file.  Each row gives the
** entity name (without the leading '&'), the VERS_* mask of markup
** versions that define it, and its character code.  The table is
** terminated by a row whose name is NULL; the lookup routines in this
** file scan it from the top and stop at that sentinel.
*/
static const entity entities[] =
{
    /*
    ** Markup pre-defined character entities
    */
    { "quot", VERS_ALL|VERS_XML, 34 },
    { "amp", VERS_ALL|VERS_XML, 38 },
    { "apos", VERS_FROM40|VERS_XML, 39 },
    { "lt", VERS_ALL|VERS_XML, 60 },
    { "gt", VERS_ALL|VERS_XML, 62 },

    /*
    ** Latin-1 character entities
    */
    { "nbsp", VERS_ALL, 160 },
    { "iexcl", VERS_ALL, 161 },
    { "cent", VERS_ALL, 162 },
    { "pound", VERS_ALL, 163 },
    { "curren", VERS_ALL, 164 },
    { "yen", VERS_ALL, 165 },
    { "brvbar", VERS_ALL, 166 },
    { "sect", VERS_ALL, 167 },
    { "uml", VERS_ALL, 168 },
    { "copy", VERS_ALL, 169 },
    { "ordf", VERS_ALL, 170 },
    { "laquo", VERS_ALL, 171 },
    { "not", VERS_ALL, 172 },
    { "shy", VERS_ALL, 173 },
    { "reg", VERS_ALL, 174 },
    { "macr", VERS_ALL, 175 },
    { "deg", VERS_ALL, 176 },
    { "plusmn", VERS_ALL, 177 },
    { "sup2", VERS_ALL, 178 },
    { "sup3", VERS_ALL, 179 },
    { "acute", VERS_ALL, 180 },
    { "micro", VERS_ALL, 181 },
    { "para", VERS_ALL, 182 },
    { "middot", VERS_ALL, 183 },
    { "cedil", VERS_ALL, 184 },
    { "sup1", VERS_ALL, 185 },
    { "ordm", VERS_ALL, 186 },
    { "raquo", VERS_ALL, 187 },
    { "frac14", VERS_ALL, 188 },
    { "frac12", VERS_ALL, 189 },
    { "frac34", VERS_ALL, 190 },
    { "iquest", VERS_ALL, 191 },
    { "Agrave", VERS_ALL, 192 },
    { "Aacute", VERS_ALL, 193 },
    { "Acirc", VERS_ALL, 194 },
    { "Atilde", VERS_ALL, 195 },
    { "Auml", VERS_ALL, 196 },
    { "Aring", VERS_ALL, 197 },
    { "AElig", VERS_ALL, 198 },
    { "Ccedil", VERS_ALL, 199 },
    { "Egrave", VERS_ALL, 200 },
    { "Eacute", VERS_ALL, 201 },
    { "Ecirc", VERS_ALL, 202 },
    { "Euml", VERS_ALL, 203 },
    { "Igrave", VERS_ALL, 204 },
    { "Iacute", VERS_ALL, 205 },
    { "Icirc", VERS_ALL, 206 },
    { "Iuml", VERS_ALL, 207 },
    { "ETH", VERS_ALL, 208 },
    { "Ntilde", VERS_ALL, 209 },
    { "Ograve", VERS_ALL, 210 },
    { "Oacute", VERS_ALL, 211 },
    { "Ocirc", VERS_ALL, 212 },
    { "Otilde", VERS_ALL, 213 },
    { "Ouml", VERS_ALL, 214 },
    { "times", VERS_ALL, 215 },
    { "Oslash", VERS_ALL, 216 },
    { "Ugrave", VERS_ALL, 217 },
    { "Uacute", VERS_ALL, 218 },
    { "Ucirc", VERS_ALL, 219 },
    { "Uuml", VERS_ALL, 220 },
    { "Yacute", VERS_ALL, 221 },
    { "THORN", VERS_ALL, 222 },
    { "szlig", VERS_ALL, 223 },
    { "agrave", VERS_ALL, 224 },
    { "aacute", VERS_ALL, 225 },
    { "acirc", VERS_ALL, 226 },
    { "atilde", VERS_ALL, 227 },
    { "auml", VERS_ALL, 228 },
    { "aring", VERS_ALL, 229 },
    { "aelig", VERS_ALL, 230 },
    { "ccedil", VERS_ALL, 231 },
    { "egrave", VERS_ALL, 232 },
    { "eacute", VERS_ALL, 233 },
    { "ecirc", VERS_ALL, 234 },
    { "euml", VERS_ALL, 235 },
    { "igrave", VERS_ALL, 236 },
    { "iacute", VERS_ALL, 237 },
    { "icirc", VERS_ALL, 238 },
    { "iuml", VERS_ALL, 239 },
    { "eth", VERS_ALL, 240 },
    { "ntilde", VERS_ALL, 241 },
    { "ograve", VERS_ALL, 242 },
    { "oacute", VERS_ALL, 243 },
    { "ocirc", VERS_ALL, 244 },
    { "otilde", VERS_ALL, 245 },
    { "ouml", VERS_ALL, 246 },
    { "divide", VERS_ALL, 247 },
    { "oslash", VERS_ALL, 248 },
    { "ugrave", VERS_ALL, 249 },
    { "uacute", VERS_ALL, 250 },
    { "ucirc", VERS_ALL, 251 },
    { "uuml", VERS_ALL, 252 },
    { "yacute", VERS_ALL, 253 },
    { "thorn", VERS_ALL, 254 },
    { "yuml", VERS_ALL, 255 },

    /*
    ** Extended Entities defined in HTML 4: Symbols
    */
    { "fnof", VERS_FROM40, 402 },
    { "Alpha", VERS_FROM40, 913 },
    { "Beta", VERS_FROM40, 914 },
    { "Gamma", VERS_FROM40, 915 },
    { "Delta", VERS_FROM40, 916 },
    { "Epsilon", VERS_FROM40, 917 },
    { "Zeta", VERS_FROM40, 918 },
    { "Eta", VERS_FROM40, 919 },
    { "Theta", VERS_FROM40, 920 },
    { "Iota", VERS_FROM40, 921 },
    { "Kappa", VERS_FROM40, 922 },
    { "Lambda", VERS_FROM40, 923 },
    { "Mu", VERS_FROM40, 924 },
    { "Nu", VERS_FROM40, 925 },
    { "Xi", VERS_FROM40, 926 },
    { "Omicron", VERS_FROM40, 927 },
    { "Pi", VERS_FROM40, 928 },
    { "Rho", VERS_FROM40, 929 },
    { "Sigma", VERS_FROM40, 931 },
    { "Tau", VERS_FROM40, 932 },
    { "Upsilon", VERS_FROM40, 933 },
    { "Phi", VERS_FROM40, 934 },
    { "Chi", VERS_FROM40, 935 },
    { "Psi", VERS_FROM40, 936 },
    { "Omega", VERS_FROM40, 937 },
    { "alpha", VERS_FROM40, 945 },
    { "beta", VERS_FROM40, 946 },
    { "gamma", VERS_FROM40, 947 },
    { "delta", VERS_FROM40, 948 },
    { "epsilon", VERS_FROM40, 949 },
    { "zeta", VERS_FROM40, 950 },
    { "eta", VERS_FROM40, 951 },
    { "theta", VERS_FROM40, 952 },
    { "iota", VERS_FROM40, 953 },
    { "kappa", VERS_FROM40, 954 },
    { "lambda", VERS_FROM40, 955 },
    { "mu", VERS_FROM40, 956 },
    { "nu", VERS_FROM40, 957 },
    { "xi", VERS_FROM40, 958 },
    { "omicron", VERS_FROM40, 959 },
    { "pi", VERS_FROM40, 960 },
    { "rho", VERS_FROM40, 961 },
    { "sigmaf", VERS_FROM40, 962 },
    { "sigma", VERS_FROM40, 963 },
    { "tau", VERS_FROM40, 964 },
    { "upsilon", VERS_FROM40, 965 },
    { "phi", VERS_FROM40, 966 },
    { "chi", VERS_FROM40, 967 },
    { "psi", VERS_FROM40, 968 },
    { "omega", VERS_FROM40, 969 },
    { "thetasym", VERS_FROM40, 977 },
    { "upsih", VERS_FROM40, 978 },
    { "piv", VERS_FROM40, 982 },
    { "bull", VERS_FROM40, 8226 },
    { "hellip", VERS_FROM40, 8230 },
    { "prime", VERS_FROM40, 8242 },
    { "Prime", VERS_FROM40, 8243 },
    { "oline", VERS_FROM40, 8254 },
    { "frasl", VERS_FROM40, 8260 },
    { "weierp", VERS_FROM40, 8472 },
    { "image", VERS_FROM40, 8465 },
    { "real", VERS_FROM40, 8476 },
    { "trade", VERS_FROM40, 8482 },
    { "alefsym", VERS_FROM40, 8501 },
    { "larr", VERS_FROM40, 8592 },
    { "uarr", VERS_FROM40, 8593 },
    { "rarr", VERS_FROM40, 8594 },
    { "darr", VERS_FROM40, 8595 },
    { "harr", VERS_FROM40, 8596 },
    { "crarr", VERS_FROM40, 8629 },
    { "lArr", VERS_FROM40, 8656 },
    { "uArr", VERS_FROM40, 8657 },
    { "rArr", VERS_FROM40, 8658 },
    { "dArr", VERS_FROM40, 8659 },
    { "hArr", VERS_FROM40, 8660 },
    { "forall", VERS_FROM40, 8704 },
    { "part", VERS_FROM40, 8706 },
    { "exist", VERS_FROM40, 8707 },
    { "empty", VERS_FROM40, 8709 },
    { "nabla", VERS_FROM40, 8711 },
    { "isin", VERS_FROM40, 8712 },
    { "notin", VERS_FROM40, 8713 },
    { "ni", VERS_FROM40, 8715 },
    { "prod", VERS_FROM40, 8719 },
    { "sum", VERS_FROM40, 8721 },
    { "minus", VERS_FROM40, 8722 },
    { "lowast", VERS_FROM40, 8727 },
    { "radic", VERS_FROM40, 8730 },
    { "prop", VERS_FROM40, 8733 },
    { "infin", VERS_FROM40, 8734 },
    { "ang", VERS_FROM40, 8736 },
    { "and", VERS_FROM40, 8743 },
    { "or", VERS_FROM40, 8744 },
    { "cap", VERS_FROM40, 8745 },
    { "cup", VERS_FROM40, 8746 },
    { "int", VERS_FROM40, 8747 },
    { "there4", VERS_FROM40, 8756 },
    { "sim", VERS_FROM40, 8764 },
    { "cong", VERS_FROM40, 8773 },
    { "asymp", VERS_FROM40, 8776 },
    { "ne", VERS_FROM40, 8800 },
    { "equiv", VERS_FROM40, 8801 },
    { "le", VERS_FROM40, 8804 },
    { "ge", VERS_FROM40, 8805 },
    { "sub", VERS_FROM40, 8834 },
    { "sup", VERS_FROM40, 8835 },
    { "nsub", VERS_FROM40, 8836 },
    { "sube", VERS_FROM40, 8838 },
    { "supe", VERS_FROM40, 8839 },
    { "oplus", VERS_FROM40, 8853 },
    { "otimes", VERS_FROM40, 8855 },
    { "perp", VERS_FROM40, 8869 },
    { "sdot", VERS_FROM40, 8901 },
    { "lceil", VERS_FROM40, 8968 },
    { "rceil", VERS_FROM40, 8969 },
    { "lfloor", VERS_FROM40, 8970 },
    { "rfloor", VERS_FROM40, 8971 },
    { "lang", VERS_FROM40, 10216 },
    { "rang", VERS_FROM40, 10217 },
    { "loz", VERS_FROM40, 9674 },
    { "spades", VERS_FROM40, 9824 },
    { "clubs", VERS_FROM40, 9827 },
    { "hearts", VERS_FROM40, 9829 },
    { "diams", VERS_FROM40, 9830 },

    /*
    ** Extended Entities defined in HTML 4: Special (less Markup at top)
    */
    { "OElig", VERS_FROM40, 338 },
    { "oelig", VERS_FROM40, 339 },
    { "Scaron", VERS_FROM40, 352 },
    { "scaron", VERS_FROM40, 353 },
    { "Yuml", VERS_FROM40, 376 },
    { "circ", VERS_FROM40, 710 },
    { "tilde", VERS_FROM40, 732 },
    { "ensp", VERS_FROM40, 8194 },
    { "emsp", VERS_FROM40, 8195 },
    { "thinsp", VERS_FROM40, 8201 },
    { "zwnj", VERS_FROM40, 8204 },
    { "zwj", VERS_FROM40, 8205 },
    { "lrm", VERS_FROM40, 8206 },
    { "rlm", VERS_FROM40, 8207 },
    { "ndash", VERS_FROM40, 8211 },
    { "mdash", VERS_FROM40, 8212 },
    { "lsquo", VERS_FROM40, 8216 },
    { "rsquo", VERS_FROM40, 8217 },
    { "sbquo", VERS_FROM40, 8218 },
    { "ldquo", VERS_FROM40, 8220 },
    { "rdquo", VERS_FROM40, 8221 },
    { "bdquo", VERS_FROM40, 8222 },
    { "dagger", VERS_FROM40, 8224 },
    { "Dagger", VERS_FROM40, 8225 },
    { "permil", VERS_FROM40, 8240 },
    { "lsaquo", VERS_FROM40, 8249 },
    { "rsaquo", VERS_FROM40, 8250 },
    { "euro", VERS_FROM40, 8364 },

    /* sentinel: a NULL name marks the end of the table */
    { NULL, VERS_UNKNOWN, 0 }
};
/* Find the table row whose name equals `s` (an entity name given
** without its leading '&').  Returns NULL when `s` is NULL, empty,
** or not present in the table.
**
** Pure static implementation.  Trades off lookup speed for faster
** setup time (well, none actually): the table is walked from the
** top on every call.  Comparing the first character before doing
** the full string compare buys enough speed that a hash doesn't
** improve things without > 500 items in the list.
*/
static const entity* entitiesLookup( ctmbstr s )
{
    const entity *row;
    tmbchar first;

    if ( s == NULL )
        return NULL;

    first = (tmbchar) *s;
    if ( first == 0 )
        return NULL;

    for ( row = entities; row->name != NULL; ++row )
    {
        /* cheap reject before the full comparison */
        if ( *row->name != first )
            continue;

        if ( TY_(tmbstrcmp)(s, row->name) == 0 )
            return row;
    }

    return NULL;
}
#if 0
/* entity starting with "&" returns zero on error */
uint EntityCode( ctmbstr name, uint versions )
{
const entity* np;
assert( name && name[0] == '&' );
/* numeric entitity: name = "&#" followed by number */
if ( name[1] == '#' )
{
uint c = 0; /* zero on missing/bad number */
Bool isXml = ( (versions & VERS_XML) == VERS_XML );
/* 'x' prefix denotes hexadecimal number format */
if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
sscanf( name+3, "%x", &c );
else
sscanf( name+2, "%u", &c );
return (uint) c;
}
/* Named entity: name ="&" followed by a name */
if ( NULL != (np = entitiesLookup(name+1)) )
{
/* Only recognize entity name if version supports it. */
if ( np->versions & versions )
return np->code;
}
return 0; /* zero signifies unknown entity name */
}
#endif
/* Resolve an entity reference to its character code and version mask.
**
** name     - reference text beginning with '&' (asserted).  "&#" starts
**            a numeric reference, decimal by default or hexadecimal
**            after an 'x'; anything else is looked up by name in the
**            static entity table.
** isXml    - when set, only a lower-case 'x' introduces a hexadecimal
**            number; otherwise an upper-case 'X' is accepted as well.
** code     - out, must not be NULL: the character code, or 0 when the
**            reference was not recognized.
** versions - out, must not be NULL: VERS_ALL for a parsed numeric
**            reference, the table's mask for a named entity, and on
**            failure VERS_XML if isXml, else VERS_PROPRIETARY.
**
** Returns yes when the reference was recognized, no otherwise.
*/
Bool TY_(EntityInfo)( ctmbstr name, Bool isXml, uint* code, uint* versions )
{
    const entity* np;

    assert( name && name[0] == '&' );
    assert( code != NULL );
    assert( versions != NULL );

    if ( name[1] == '#' )
    {
        /* numeric entity: name = "&#" followed by number */
        uint c = 0;  /* zero on missing/bad number */
        int res;

        /* 'x' prefix denotes hexadecimal number format */
        if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
            res = sscanf( name+3, "%x", &c );
        else
            res = sscanf( name+2, "%u", &c );

        /* Issue #373 - Null Char in XML result doc - sf905 2009
        ** Only report success when a number was actually converted;
        ** otherwise fall through to the shared failure result below.
        */
        if ( res == 1 )
        {
            *code = c;
            *versions = VERS_ALL;
            return yes;
        }
    }
    else if ( NULL != (np = entitiesLookup(name+1)) )
    {
        /* Named entity: name = "&" followed by a name */
        *code = np->code;
        *versions = np->versions;
        return yes;
    }

    /* bad numeric reference or unknown entity name */
    *code = 0;
    *versions = ( isXml ? VERS_XML : VERS_PROPRIETARY );
    return no;
}
/* Reverse lookup: map a character code to its entity name.
**
** ch       - character code to search for.
** versions - VERS_* mask of acceptable markup versions.
**
** Only the first table row carrying `ch` is considered.  Its name is
** returned when the row's version mask shares a bit with `versions`;
** otherwise, or when no row carries `ch`, the result is NULL.
*/
ctmbstr TY_(EntityName)( uint ch, uint versions )
{
    const entity *row = entities;

    /* advance to the first row with this code, or to the sentinel */
    while ( row->name != NULL && row->code != ch )
        ++row;

    /* reached the sentinel: the code is not in the table */
    if ( row->name == NULL )
        return NULL;

    if ( row->versions & versions )
        return row->name;

    return NULL;
}
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
*/