Merge branch 'surrogates'

This commit is contained in:
Jim Derry 2017-02-13 08:49:06 -05:00
commit b7c84b1b57
7 changed files with 212 additions and 2 deletions

41
README/MESSAGES.md Normal file
View File

@ -0,0 +1,41 @@
# Message System
Tidy has a fairly complex warning/error messaging system. This document describes how to add a **new** warning or error message to **libTidy**.
First assign the message a key value. This is done in `message.h`, in one of the two enumerations that are listed there.
1. `tidyErrorCodes` - starts with the value `CODES_TIDY_ERROR_FIRST = 200`, and it must be first.
2. `tidyMessagesMisc` - starts with the value `ACCESS_URL = 2048`, so at present the values in `tidyErrorCodes` above must not exceed this.
3. For the sake of completeness, there's also a third enum present in `access.h` called `accessErrorCodes`; you should only ever be concerned about this if you are working on new strings for Tidy's accessibility module.
If your message is something that will appear in the error list, then its key should be defined in the `tidyErrorCodes` enum, unless you are adding errors to the accessibility module (see point 3, above). If you are adding strings that are _not_ intended for the error list, then they belong in `tidyMessagesMisc`. These are strings that are typically output with Tidy's CLI.
All enum values are only ever used by name within **libTidy** (and incidentally, should only ever be used by name in your client applications; never trust the value!), so feel free to enter new strings wherever they make the most sense. There are already existing categories (marked by comments), or feel free to create a new category if that's best.
Because some clients retrieve error information via `libTidy`'s callback mechanism, it's also important to update `language.c:tidyErrorFilterKeysStruct[]` if your new messages are intended for the error list.
## Step 1
So in this case I want to add 3 warning messages: `BAD_SURROGATE_PAIR`, `BAD_SURROGATE_TAIL`, and `BAD_SURROGATE_LEAD`. Because these are error messages, they belong in the `tidyErrorCodes` enum, and they fit nicely into the "character encoding errors" category, just before the **last** entry, `CODES_TIDY_ERROR_LAST`.
## Step 2
Because the new messages are error codes, update the `tidyErrorFilterKeysStruct` in `language.c` with the same key values, and with string representations thereof. You should put them in the same logical order as you inserted them into the `tidyErrorCodes` enum.
Note that at some point when all of the error enums are merged (probably Tidy 5.5) this kludge won't have to be used and we can have a nice, single enum exported to clients.
## Step 3
The next step is adding a `format` string to `language_en.h`. This string may later be translated to various supported language strings, but at present it is important that the other language translated strings, like `language_fr.h`, `language_es.h`, etc, keep the same format order.
Where to add this seems a bit of a mess, but in general things are grouped by where they're used in `libTidy`, and often in alphabetical order within those groups. Here I've added them relative to where they were placed in the other enums and structs.
Depending on which of the output routines you use (consult `message.c`) you may be able to use parameters such as `%u` and `%s` in your format strings. The available data is currently limited to the available message output routines, but perhaps generalizing this in order to make more data available will be a nice focus of Tidy 5.5. Please don't use `printf` for message output within **libTidy**.
In this case I also want to show the code point(s) in hex, so I need to add support for that as well. **(jim --??)**
eof;

View File

@ -328,6 +328,9 @@ static const tidyErrorFilterKeyItem tidyErrorFilterKeysStruct[] = {
{ "ENCODING_MISMATCH", ENCODING_MISMATCH },
{ "INVALID_URI", INVALID_URI },
{ "INVALID_NCR", INVALID_NCR },
{ "BAD_SURROGATE_PAIR", BAD_SURROGATE_PAIR },
{ "BAD_SURROGATE_TAIL", BAD_SURROGATE_TAIL },
{ "BAD_SURROGATE_LEAD", BAD_SURROGATE_LEAD },
{ "CODES_TIDY_ERROR_LAST", CODES_TIDY_ERROR_LAST },
#if SUPPORT_ACCESSIBILITY_CHECKS
/* This block of codes comes from the `accessErrorCodes` enum. */

View File

@ -336,7 +336,10 @@ static languageDefinition language_en = { whichPluralForm_en, {
{ INVALID_UTF8, 0, "%s invalid UTF-8 bytes (char. code %s)" }, /* Error */
{ INVALID_UTF16, 0, "%s invalid UTF-16 surrogate pair (char. code %s)" }, /* Error */
{ INVALID_NCR, 0, "%s invalid numeric character reference %s" }, /* Error */
{ BAD_SURROGATE_PAIR, 0, "Have out-of-range surrogate pair U+%04X:U+%04X, replaced with U+FFFD value."}, /* warning */
{ BAD_SURROGATE_TAIL, 0, "Leading (High) surrogate pair U+%04X, with no trailing (Low) entity, replaced with U+FFFD." }, /* warning */
{ BAD_SURROGATE_LEAD, 0, "Trailing (Low) surrogate pair U+%04X, with no leading (High) entity, replaced with U+FFFD." }, /* warning */
/* ReportEntityError */
{ MISSING_SEMICOLON, 0, "entity \"%s\" doesn't end in ';'" }, /* Warning in HTML, Error in XML/XHTML */
{ MISSING_SEMICOLON_NCR, 0, "numeric character reference \"%s\" doesn't end in ';'" }, /* Warning in HTML, Error in XML/XHTML */

View File

@ -1033,6 +1033,121 @@ static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer )
lexer->columns = doc->docIn->curcol;
}
/*
Issue #483
Have detected the first of a surrogate pair...
Try to find, decode the second...
Already have '&' start...
*/
/* Result of GetSurrogatePair()'s attempt to complete a surrogate pair. */
typedef enum {
SP_ok,      /* second half found and combined into a valid character */
SP_failed,  /* second half found, but the combined value is invalid;
               a substitute character is returned instead */
SP_error    /* no usable second half; consumed characters are put back */
}SPStatus;
static SPStatus GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
{
Lexer* lexer = doc->lexer;
uint bufSize = 32;
uint c, ch, offset = 0;
tmbstr buf = 0;
SPStatus status = SP_error; /* assume failed */
int type = 0; /* assume numeric */
uint fch = *pch;
int i; /* has to be signed due to for i >= 0 */
if (!lexer)
return status;
buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize);
if (!buf)
return status;
while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream )
{
if (c == ';')
{
break; /* reached end of entity */
}
if ((offset + 2) > bufSize)
{
bufSize *= 2;
buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize);
if (!buf)
{
break;
}
}
buf[offset++] = c; /* add char to buffer */
if (offset == 1)
{
if (c != '#') /* is a numeric entity */
break;
}
else if (offset == 2 && ((c == 'x') || (!isXml && c == 'X')))
{
type = 1; /* set hex digits */
}
else
{
if (type) /* if hex digits */
{
if (!IsDigitHex(c))
break;
}
else /* if numeric */
{
if (!TY_(IsDigit)(c))
break;
}
}
}
if (c == ';')
{
buf[offset] = 0;
if (type)
sscanf(buf + 2, "%x", &ch);
else
sscanf(buf + 1, "%d", &ch);
if (TY_(IsHighSurrogate)(ch))
{
ch = TY_(CombineSurrogatePair)(ch, fch);
if (TY_(IsValidCombinedChar)(ch))
{
*pch = ch; /* return combined pair value */
status = SP_ok; /* full success - pair used */
}
else
{
status = SP_failed; /* is one of the 32 out-of-range pairs */
*pch = 0xFFFD; /* return substitute character */
TY_(ReportSurrogateError)(doc, BAD_SURROGATE_PAIR, fch, ch); /* SP WARNING: - */
}
}
}
if (status == SP_error)
{
/* Error condition - can only put back all the chars */
if (c == ';') /* if last, not added to buffer */
TY_(UngetChar)(c, doc->docIn);
if (buf && offset)
{
/* correct the order for unget - last first */
for (i = offset - 1; i >= 0; i--)
{
c = buf[i];
TY_(UngetChar)(c, doc->docIn);
}
}
}
if (buf)
TidyFree(lexer->allocator, buf);
return status;
}
/*
No longer attempts to insert missing ';' for unknown
entities unless one was present already, since this
@ -1159,6 +1274,42 @@ static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode )
found = TY_(EntityInfo)( lexer->lexbuf+start, isXml, &ch, &entver );
}
/* Issue #483 - Deal with 'surrogate pairs' */
/* TODO: Maybe warning/error, like found a leading surrogate
but no following surrogate! Maybe should avoid outputting
invalid utf-8 for this entity - maybe substitute? */
if (!preserveEntities && found && TY_(IsLowSurrogate)(ch))
{
uint c1;
if ((c1 = TY_(ReadChar)(doc->docIn)) == '&')
{
SPStatus status;
/* Have a following entity,
so there is a chance of having a valid surrogate pair */
c1 = ch; /* keep first value, in case of error */
status = GetSurrogatePair(doc, isXml, &ch);
if (status == SP_error)
{
TY_(ReportSurrogateError)(doc, BAD_SURROGATE_TAIL, c1, 0); /* SP WARNING: - using substitute character */
TY_(UngetChar)('&', doc->docIn); /* otherwise put it back */
}
}
else
{
/* put this non-entity lead char back */
TY_(UngetChar)(c1, doc->docIn);
/* Have leading surrogate pair, with no tail */
TY_(ReportSurrogateError)(doc, BAD_SURROGATE_TAIL, ch, 0); /* SP WARNING: - using substitute character */
ch = 0xFFFD;
}
}
else if (!preserveEntities && found && TY_(IsHighSurrogate)(ch))
{
/* Have trailing surrogate pair, with no lead */
TY_(ReportSurrogateError)(doc, BAD_SURROGATE_LEAD, ch, 0); /* SP WARNING: - using substitute character */
ch = 0xFFFD;
}
/* deal with unrecognized or invalid entities */
/* #433012 - fix by Randy Waki 17 Feb 01 */
/* report invalid NCR's - Terry Teague 01 Sep 01 */

View File

@ -530,6 +530,13 @@ void TY_(ReportEntityError)( TidyDocImpl* doc, uint code, ctmbstr entity,
messageLexer( doc, TidyWarning, code, fmt, entityname );
}
/* Emit a surrogate-related warning: look up the localized format string
   for `code` and, if one exists, pass it to messageLexer() as a
   TidyWarning together with the two character values. */
void TY_(ReportSurrogateError)(TidyDocImpl* doc, uint code, uint c1, uint c2)
{
    ctmbstr message = tidyLocalizedString(code);

    /* nothing to report without a format string */
    if (!message)
        return;

    messageLexer(doc, TidyWarning, code, message, c1, c2);
}
void TY_(ReportAttrError)(TidyDocImpl* doc, Node *node, AttVal *av, uint code)
{
char const *name = "NULL", *value = "NULL";

View File

@ -46,6 +46,7 @@ void TY_(ReportEncodingError)(TidyDocImpl* doc, uint code, uint c, Bool discarde
void TY_(ReportEntityError)( TidyDocImpl* doc, uint code, ctmbstr entity, int c );
void TY_(ReportAttrError)( TidyDocImpl* doc, Node* node, AttVal* av, uint code );
void TY_(ReportMissingAttr)( TidyDocImpl* doc, Node* node, ctmbstr name );
/* Reports a surrogate pair problem as a warning; `code` selects the
   localized message and `c1`/`c2` are the character values passed to it. */
void TY_(ReportSurrogateError)(TidyDocImpl* doc, uint code, uint c1, uint c2);
#if SUPPORT_ACCESSIBILITY_CHECKS
@ -186,6 +187,10 @@ typedef enum {
INVALID_URI,
INVALID_NCR,
BAD_SURROGATE_PAIR,
BAD_SURROGATE_TAIL,
BAD_SURROGATE_LEAD,
/* This MUST be present and last. */
CODES_TIDY_ERROR_LAST
} tidyErrorCodes;

View File

@ -1,2 +1,2 @@
5.3.16
5.3.17
2017.02.12