Merge branch 'surrogates'
This commit is contained in:
commit
b7c84b1b57
41
README/MESSAGES.md
Normal file
41
README/MESSAGES.md
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
# Message System
|
||||||
|
|
||||||
|
Tidy has a fairly complex warning/error messaging system. This document explains how to add a **new** warning or error message to **libTidy**.
|
||||||
|
|
||||||
|
First assign the message a key value. This is done in `message.h`, in one of the two enumerations that are listed there.
|
||||||
|
|
||||||
|
1. `tidyErrorCodes` - starts with the value `CODES_TIDY_ERROR_FIRST = 200`, and it must be first.
|
||||||
|
|
||||||
|
2. `tidyMessagesMisc` - starts with the value `ACCESS_URL = 2048`, so at present the values in the above `tidyErrorCodes` enum must not exceed this.
|
||||||
|
|
||||||
|
3. For the sake of completeness, there's also a third enum present in `access.h` called `accessErrorCodes`; you should only ever be concerned about this if you are working on new strings for Tidy's accessibility module.
|
||||||
|
|
||||||
|
If your message is something that will appear in the error list, then its key should be defined in the `tidyErrorCodes` enum, unless you are adding errors to the accessibility module (see point 3, above). If you are adding strings that are _not_ intended for the error list, then they belong in `tidyMessagesMisc`. These are strings that are typically output with Tidy's CLI.
|
||||||
|
|
||||||
|
All enum values are only ever used by name within **libTidy** (and incidentally, should only ever be used by name in your client applications; never trust the value!), so feel free to enter new strings wherever they make the most sense. There are already existing categories (marked by comments), or feel free to create a new category if that's best.
|
||||||
|
|
||||||
|
Because some clients retrieve error information via `libTidy`’s callback mechanism, it's also important to update `language.c:tidyErrorFilterKeysStruct[]` if your new messages are intended for the error list.
|
||||||
|
|
||||||
|
|
||||||
|
## Step 1
|
||||||
|
|
||||||
|
So in this case I want to add 3 warning messages: `BAD_SURROGATE_PAIR`, `BAD_SURROGATE_TAIL`, and `BAD_SURROGATE_LEAD`. Because these are error messages, they belong in the `tidyErrorCodes` enum, and they fit nicely into the "character encoding errors" category just before the **last** `CODES_TIDY_ERROR_LAST`.
|
||||||
|
|
||||||
|
|
||||||
|
## Step 2
|
||||||
|
|
||||||
|
Because the new messages are error codes, update the `tidyErrorFilterKeysStruct` in `language.c` with the same key values, and with string representations thereof. You should put them in the same logical order as you inserted them into the `tidyErrorCodes` enum.
|
||||||
|
|
||||||
|
Note that at some point when all of the error enums are merged (probably Tidy 5.5) this kludge won't have to be used and we can have a nice, single enum exported to clients.
|
||||||
|
|
||||||
|
## Step 3
|
||||||
|
|
||||||
|
The next step is adding a `format` string to `language_en.h`. This string may later be translated to various supported language strings, but at present it is important that the other language translated strings, like `language_fr.h`, `language_es.h`, etc, keep the same format order.
|
||||||
|
|
||||||
|
Where to add this seems a bit of a mess, but in general things are grouped by where they're used in `libTidy`, and often in alphabetical order within those groups. Here I've added them relative to where they were placed in the other enums and structs.
|
||||||
|
|
||||||
|
Depending on which of the output routines you use (consult `message.c`) you may be able to use parameters such as `%u` and `%s` in your format strings. The available data is currently limited to the available message output routines, but perhaps generalizing this in order to make more data available will be a nice focus of Tidy 5.5. Please don't use `printf` for message output within **libTidy**.
|
||||||
|
|
||||||
|
In this case I want the messages to show the code point(s) in hex, so I need to add support for that as well. **(jim --??)**
|
||||||
|
|
||||||
|
eof;
|
|
@ -328,6 +328,9 @@ static const tidyErrorFilterKeyItem tidyErrorFilterKeysStruct[] = {
|
||||||
{ "ENCODING_MISMATCH", ENCODING_MISMATCH },
|
{ "ENCODING_MISMATCH", ENCODING_MISMATCH },
|
||||||
{ "INVALID_URI", INVALID_URI },
|
{ "INVALID_URI", INVALID_URI },
|
||||||
{ "INVALID_NCR", INVALID_NCR },
|
{ "INVALID_NCR", INVALID_NCR },
|
||||||
|
{ "BAD_SURROGATE_PAIR", BAD_SURROGATE_PAIR },
|
||||||
|
{ "BAD_SURROGATE_TAIL", BAD_SURROGATE_TAIL },
|
||||||
|
{ "BAD_SURROGATE_LEAD", BAD_SURROGATE_LEAD },
|
||||||
{ "CODES_TIDY_ERROR_LAST", CODES_TIDY_ERROR_LAST },
|
{ "CODES_TIDY_ERROR_LAST", CODES_TIDY_ERROR_LAST },
|
||||||
#if SUPPORT_ACCESSIBILITY_CHECKS
|
#if SUPPORT_ACCESSIBILITY_CHECKS
|
||||||
/* This blocks of codes comes from `accessErrorCodes` enum. */
|
/* This blocks of codes comes from `accessErrorCodes` enum. */
|
||||||
|
|
|
@ -336,6 +336,9 @@ static languageDefinition language_en = { whichPluralForm_en, {
|
||||||
{ INVALID_UTF8, 0, "%s invalid UTF-8 bytes (char. code %s)" }, /* Error */
|
{ INVALID_UTF8, 0, "%s invalid UTF-8 bytes (char. code %s)" }, /* Error */
|
||||||
{ INVALID_UTF16, 0, "%s invalid UTF-16 surrogate pair (char. code %s)" }, /* Error */
|
{ INVALID_UTF16, 0, "%s invalid UTF-16 surrogate pair (char. code %s)" }, /* Error */
|
||||||
{ INVALID_NCR, 0, "%s invalid numeric character reference %s" }, /* Error */
|
{ INVALID_NCR, 0, "%s invalid numeric character reference %s" }, /* Error */
|
||||||
|
{ BAD_SURROGATE_PAIR, 0, "Have out-of-range surrogate pair U+%04X:U+%04X, replaced with U+FFFD value."}, /* warning */
|
||||||
|
{ BAD_SURROGATE_TAIL, 0, "Leading (High) surrogate pair U+%04X, with no trailing (Low) entity, replaced with U+FFFD." }, /* warning */
|
||||||
|
{ BAD_SURROGATE_LEAD, 0, "Trailing (Low) surrogate pair U+%04X, with no leading (High) entity, replaced with U+FFFD." }, /* warning */
|
||||||
|
|
||||||
/* ReportEntityError */
|
/* ReportEntityError */
|
||||||
{ MISSING_SEMICOLON, 0, "entity \"%s\" doesn't end in ';'" }, /* Warning in HTML, Error in XML/XHTML */
|
{ MISSING_SEMICOLON, 0, "entity \"%s\" doesn't end in ';'" }, /* Warning in HTML, Error in XML/XHTML */
|
||||||
|
|
151
src/lexer.c
151
src/lexer.c
|
@ -1033,6 +1033,121 @@ static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer )
|
||||||
lexer->columns = doc->docIn->curcol;
|
lexer->columns = doc->docIn->curcol;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
Issue #483
|
||||||
|
Have detected the first of a surrogate pair...
|
||||||
|
Try to find, decode the second...
|
||||||
|
Already have '&' start...
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Result of GetSurrogatePair()'s attempt to read the second half of a
   surrogate pair. */
typedef enum {
|
||||||
|
SP_ok, /* pair combined into a valid character; combined value returned */
|
||||||
|
SP_failed, /* second half found but combined value invalid; U+FFFD returned */
|
||||||
|
SP_error /* no usable second half; the characters read were pushed back */
|
||||||
|
}SPStatus;
|
||||||
|
|
||||||
|
/*
   Issue #483 - read the second half of a surrogate pair.

   The caller has already decoded the first surrogate (passed in via *pch)
   and has consumed the '&' that starts the following entity. This routine
   reads the rest of that entity, which must be a numeric character
   reference ("#ddd;" or "#xhhh;" - an upper case 'X' is accepted only when
   isXml is false), and tries to combine its value with the first surrogate.

   doc   - document whose input stream (doc->docIn) is read
   isXml - when true only a lower case 'x' introduces hex digits
   pch   - in: the first surrogate; out: the combined character on SP_ok,
           0xFFFD on SP_failed, left untouched on SP_error

   Returns
     SP_ok     - the pair combined to a valid character, stored in *pch
     SP_failed - a second surrogate was found but the combined value is not
                 valid; *pch is set to 0xFFFD and BAD_SURROGATE_PAIR is
                 reported
     SP_error  - no second surrogate could be decoded; every character read
                 here is pushed back onto the input (the '&' consumed by
                 the caller is NOT pushed back here)
*/
static SPStatus GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
{
    Lexer* lexer = doc->lexer;
    uint bufSize = 32;
    uint c, ch = 0, offset = 0;
    tmbstr buf = 0;
    tmbstr grown;
    SPStatus status = SP_error;  /* assume failed */
    int isHex = 0;               /* assume decimal digits */
    int parsed = 0;              /* set once a number has been decoded */
    int pending = 0;             /* c was read but NOT stored in buf */
    uint fch = *pch;

    if (!lexer)
        return status;
    buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize);
    if (!buf)
        return status;

    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
    {
        pending = 1;
        if (c == ';')
            break;  /* reached end of entity */

        /* Validate BEFORE storing: the character that stops the scan may
           be any code point and must not be squeezed through the byte
           buffer, or it would be pushed back with a different value. */
        if (offset == 0)
        {
            if (c != '#')  /* not a numeric entity */
                break;
        }
        else if (offset == 1 && ((c == 'x') || (!isXml && c == 'X')))
        {
            isHex = 1;  /* hex digits follow */
        }
        else if (isHex)
        {
            if (!IsDigitHex(c))
                break;
        }
        else
        {
            if (!TY_(IsDigit)(c))
                break;
        }

        /* keep room for this character plus the terminating nul */
        if ((offset + 2) > bufSize)
        {
            /* grow through a temporary so that on failure the old buffer
               is still valid and its content can be pushed back */
            grown = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize * 2);
            if (!grown)
                break;
            buf = grown;
            bufSize *= 2;
        }
        buf[offset++] = c;  /* add char to buffer */
        pending = 0;
    }

    if (c == ';')
    {
        buf[offset] = 0;
        /* Only scan when at least one digit follows the prefix, and only
           trust ch when sscanf reports a successful conversion. */
        if (isHex)
            parsed = (offset > 2) && (sscanf(buf + 2, "%x", &ch) == 1);
        else
            parsed = (offset > 1) && (sscanf(buf + 1, "%u", &ch) == 1);

        if (parsed && TY_(IsHighSurrogate)(ch))
        {
            ch = TY_(CombineSurrogatePair)(ch, fch);
            if (TY_(IsValidCombinedChar)(ch))
            {
                *pch = ch;       /* return combined pair value */
                status = SP_ok;  /* full success - pair used */
            }
            else
            {
                status = SP_failed;  /* is one of the 32 out-of-range pairs */
                *pch = 0xFFFD;       /* return substitute character */
                /* NOTE(review): the second value reported is the combined
                   value, not the second surrogate - confirm this is what
                   the BAD_SURROGATE_PAIR message intends. */
                TY_(ReportSurrogateError)(doc, BAD_SURROGATE_PAIR, fch, ch);  /* SP WARNING: - */
            }
        }
    }

    if (status == SP_error)
    {
        /* Error condition - can only put back all the chars, last first */
        if (pending)  /* the ';' or the char that stopped the scan */
            TY_(UngetChar)(c, doc->docIn);
        while (offset > 0)
        {
            --offset;
            TY_(UngetChar)((uint)(unsigned char)buf[offset], doc->docIn);
        }
    }

    TidyFree(lexer->allocator, buf);

    return status;
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
No longer attempts to insert missing ';' for unknown
|
No longer attempts to insert missing ';' for unknown
|
||||||
enitities unless one was present already, since this
|
enitities unless one was present already, since this
|
||||||
|
@ -1159,6 +1274,42 @@ static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode )
|
||||||
found = TY_(EntityInfo)( lexer->lexbuf+start, isXml, &ch, &entver );
|
found = TY_(EntityInfo)( lexer->lexbuf+start, isXml, &ch, &entver );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Issue #483 - Deal with 'surrogate pairs' */
|
||||||
|
/* TODO: Maybe warning/error, like found a leading surrogate
|
||||||
|
but no following surrogate! Maybe should avoid outputting
|
||||||
|
invalid utf-8 for this entity - maybe substitute? */
|
||||||
|
if (!preserveEntities && found && TY_(IsLowSurrogate)(ch))
|
||||||
|
{
|
||||||
|
uint c1;
|
||||||
|
if ((c1 = TY_(ReadChar)(doc->docIn)) == '&')
|
||||||
|
{
|
||||||
|
SPStatus status;
|
||||||
|
/* Have a following entity,
|
||||||
|
so there is a chance of having a valid surrogate pair */
|
||||||
|
c1 = ch; /* keep first value, in case of error */
|
||||||
|
status = GetSurrogatePair(doc, isXml, &ch);
|
||||||
|
if (status == SP_error)
|
||||||
|
{
|
||||||
|
TY_(ReportSurrogateError)(doc, BAD_SURROGATE_TAIL, c1, 0); /* SP WARNING: - using substitute character */
|
||||||
|
TY_(UngetChar)('&', doc->docIn); /* otherwise put it back */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* put this non-entity lead char back */
|
||||||
|
TY_(UngetChar)(c1, doc->docIn);
|
||||||
|
/* Have leading surrogate pair, with no tail */
|
||||||
|
TY_(ReportSurrogateError)(doc, BAD_SURROGATE_TAIL, ch, 0); /* SP WARNING: - using substitute character */
|
||||||
|
ch = 0xFFFD;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (!preserveEntities && found && TY_(IsHighSurrogate)(ch))
|
||||||
|
{
|
||||||
|
/* Have trailing surrogate pair, with no lead */
|
||||||
|
TY_(ReportSurrogateError)(doc, BAD_SURROGATE_LEAD, ch, 0); /* SP WARNING: - using substitute character */
|
||||||
|
ch = 0xFFFD;
|
||||||
|
}
|
||||||
|
|
||||||
/* deal with unrecognized or invalid entities */
|
/* deal with unrecognized or invalid entities */
|
||||||
/* #433012 - fix by Randy Waki 17 Feb 01 */
|
/* #433012 - fix by Randy Waki 17 Feb 01 */
|
||||||
/* report invalid NCR's - Terry Teague 01 Sep 01 */
|
/* report invalid NCR's - Terry Teague 01 Sep 01 */
|
||||||
|
|
|
@ -530,6 +530,13 @@ void TY_(ReportEntityError)( TidyDocImpl* doc, uint code, ctmbstr entity,
|
||||||
messageLexer( doc, TidyWarning, code, fmt, entityname );
|
messageLexer( doc, TidyWarning, code, fmt, entityname );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Report a surrogate pair problem as a warning.
   The localized format string for `code` is looked up and, when one is
   found, handed to messageLexer() with c1 and c2 as its arguments.
   Nothing is reported when no string is found. */
void TY_(ReportSurrogateError)(TidyDocImpl* doc, uint code, uint c1, uint c2)
{
    ctmbstr format = tidyLocalizedString(code);

    if (!format)
        return;

    messageLexer(doc, TidyWarning, code, format, c1, c2);
}
|
||||||
|
|
||||||
void TY_(ReportAttrError)(TidyDocImpl* doc, Node *node, AttVal *av, uint code)
|
void TY_(ReportAttrError)(TidyDocImpl* doc, Node *node, AttVal *av, uint code)
|
||||||
{
|
{
|
||||||
char const *name = "NULL", *value = "NULL";
|
char const *name = "NULL", *value = "NULL";
|
||||||
|
|
|
@ -46,6 +46,7 @@ void TY_(ReportEncodingError)(TidyDocImpl* doc, uint code, uint c, Bool discarde
|
||||||
void TY_(ReportEntityError)( TidyDocImpl* doc, uint code, ctmbstr entity, int c );
|
void TY_(ReportEntityError)( TidyDocImpl* doc, uint code, ctmbstr entity, int c );
|
||||||
void TY_(ReportAttrError)( TidyDocImpl* doc, Node* node, AttVal* av, uint code );
|
void TY_(ReportAttrError)( TidyDocImpl* doc, Node* node, AttVal* av, uint code );
|
||||||
void TY_(ReportMissingAttr)( TidyDocImpl* doc, Node* node, ctmbstr name );
|
void TY_(ReportMissingAttr)( TidyDocImpl* doc, Node* node, ctmbstr name );
|
||||||
|
void TY_(ReportSurrogateError)(TidyDocImpl* doc, uint code, uint c1, uint c2);
|
||||||
|
|
||||||
#if SUPPORT_ACCESSIBILITY_CHECKS
|
#if SUPPORT_ACCESSIBILITY_CHECKS
|
||||||
|
|
||||||
|
@ -186,6 +187,10 @@ typedef enum {
|
||||||
INVALID_URI,
|
INVALID_URI,
|
||||||
INVALID_NCR,
|
INVALID_NCR,
|
||||||
|
|
||||||
|
BAD_SURROGATE_PAIR,
|
||||||
|
BAD_SURROGATE_TAIL,
|
||||||
|
BAD_SURROGATE_LEAD,
|
||||||
|
|
||||||
/* This MUST be present and last. */
|
/* This MUST be present and last. */
|
||||||
CODES_TIDY_ERROR_LAST
|
CODES_TIDY_ERROR_LAST
|
||||||
} tidyErrorCodes;
|
} tidyErrorCodes;
|
||||||
|
|
|
@ -1,2 +1,2 @@
|
||||||
5.3.16
|
5.3.17
|
||||||
2017.02.12
|
2017.02.12
|
||||||
|
|
Loading…
Reference in a new issue