More updates for Issue #483 - Start warning msgs - WIP

This commit is contained in:
Geoff McLane 2017-02-09 20:55:23 +01:00
parent 3ca117550a
commit 75bc1f06c7
4 changed files with 58 additions and 15 deletions

View file

@ -328,6 +328,9 @@ static const tidyErrorFilterKeyItem tidyErrorFilterKeysStruct[] = {
{ "ENCODING_MISMATCH", ENCODING_MISMATCH }, { "ENCODING_MISMATCH", ENCODING_MISMATCH },
{ "INVALID_URI", INVALID_URI }, { "INVALID_URI", INVALID_URI },
{ "INVALID_NCR", INVALID_NCR }, { "INVALID_NCR", INVALID_NCR },
{ "BAD_SURROGATE_PAIR", BAD_SURROGATE_PAIR },
{ "BAD_SURROGATE_TAIL", BAD_SURROGATE_TAIL },
{ "BAD_SURROGATE_LEAD", BAD_SURROGATE_LEAD },
{ "CODES_TIDY_ERROR_LAST", CODES_TIDY_ERROR_LAST }, { "CODES_TIDY_ERROR_LAST", CODES_TIDY_ERROR_LAST },
#if SUPPORT_ACCESSIBILITY_CHECKS #if SUPPORT_ACCESSIBILITY_CHECKS
/* This block of codes comes from the `accessErrorCodes` enum. */ /* This block of codes comes from the `accessErrorCodes` enum. */

View file

@ -336,6 +336,9 @@ static languageDefinition language_en = { whichPluralForm_en, {
{ INVALID_UTF8, 0, "%s invalid UTF-8 bytes (char. code %s)" }, /* Error */ { INVALID_UTF8, 0, "%s invalid UTF-8 bytes (char. code %s)" }, /* Error */
{ INVALID_UTF16, 0, "%s invalid UTF-16 surrogate pair (char. code %s)" }, /* Error */ { INVALID_UTF16, 0, "%s invalid UTF-16 surrogate pair (char. code %s)" }, /* Error */
{ INVALID_NCR, 0, "%s invalid numeric character reference %s" }, /* Error */ { INVALID_NCR, 0, "%s invalid numeric character reference %s" }, /* Error */
{ BAD_SURROGATE_PAIR, 0, "Have out-of-range surrogate pair U+%s:U+%s, replaced with 2 U+FFFD values."}, /* warning */
{ BAD_SURROGATE_TAIL, 0, "Leading (High) surrogate pair U+%s, with no trailing (Low) entity, replaced with U+FFFD." }, /* warning */
{ BAD_SURROGATE_LEAD, 0, "Trailing (Low) surrogate pair U+%s, with no leading (High) entity, replaced with U+FFFD." }, /* warning */
/* ReportEntityError */ /* ReportEntityError */
{ MISSING_SEMICOLON, 0, "entity \"%s\" doesn't end in ';'" }, /* Warning in HTML, Error in XML/XHTML */ { MISSING_SEMICOLON, 0, "entity \"%s\" doesn't end in ';'" }, /* Warning in HTML, Error in XML/XHTML */

View file

@ -1034,25 +1034,33 @@ static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer )
} }
/* /*
Issue #483
Have detected the first of a surrogate pair... Have detected the first of a surrogate pair...
Try to find, decode the second... Try to find, decode the second...
Already have '&' start... Already have '&' start...
*/ */
static Bool GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
/* Result of trying to read and decode the second half of a surrogate pair. */
typedef enum {
    SP_ok     = 0,  /* full success: the combined pair value is returned */
    SP_failed = 1,  /* pair decoded but out of range: U+FFFD is returned instead */
    SP_error  = 2   /* error condition: the chars read are put back */
} SPStatus;
static SPStatus GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
{ {
Lexer* lexer = doc->lexer; Lexer* lexer = doc->lexer;
uint bufSize = 32; uint bufSize = 32;
uint c, ch, offset = 0; uint c, ch, offset = 0;
tmbstr buf = 0; tmbstr buf = 0;
Bool success = no; /* assume failed */ SPStatus status = SP_error; /* assume failed */
int type = 0; /* assume numeric */ int type = 0; /* assume numeric */
uint fch = *pch; uint fch = *pch;
int i; /* has to be signed due to for i >= 0 */ int i; /* has to be signed due to for i >= 0 */
if (!lexer) if (!lexer)
return no; return status;
buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize); buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize);
if (!buf) if (!buf)
return no; return status;
while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream ) while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream )
{ {
if (c == ';') if (c == ';')
@ -1107,12 +1115,21 @@ static Bool GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
if (TY_(IsValidCombinedChar)(ch)) if (TY_(IsValidCombinedChar)(ch))
{ {
*pch = ch; /* return combined pair value */ *pch = ch; /* return combined pair value */
success = yes; status = SP_ok; /* full success - pair used */
} }
} else
}
if (!success)
{ {
status = SP_failed; /* is one of the 32 out-of-range pairs */
*pch = 0xFFFD; /* return substitute character */
/* SP WARNING: - BAD_SURROGATE_PAIR */
fprintf(stderr, "Warning: Have out-of-range surrogate pair U+%04X:U+%04X, replaced with U+FFFD value.\n", fch, ch);
}
}
}
if (status == SP_error)
{
/* Error condition - can only put back all the chars */
if (c == ';') /* if last, not added to buffer */ if (c == ';') /* if last, not added to buffer */
TY_(UngetChar)(c, doc->docIn); TY_(UngetChar)(c, doc->docIn);
if (buf && offset) if (buf && offset)
@ -1129,7 +1146,7 @@ static Bool GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
if (buf) if (buf)
TidyFree(lexer->allocator, buf); TidyFree(lexer->allocator, buf);
return success; return status;
} }
/* /*
@ -1267,18 +1284,34 @@ static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode )
uint c1; uint c1;
if ((c1 = TY_(ReadChar)(doc->docIn)) == '&') if ((c1 = TY_(ReadChar)(doc->docIn)) == '&')
{ {
/* have a following entity */ SPStatus status;
if (!GetSurrogatePair(doc, isXml, &ch)) /* Have a following entity,
so there is a chance of having a valid surrogate pair */
c1 = ch; /* keep first value, in case of error */
status = GetSurrogatePair(doc, isXml, &ch);
if (status == SP_error)
{ {
TY_(UngetChar)(c1, doc->docIn); /* otherwise put it back */ /* SP WARNING: BAD_SURROGATE_TAIL - use substitute character */
fprintf(stderr, "Warning: Leading(High) surrogate pair U+%04X, with no trailing(Low) entity, replaced with U+FFFD.\n", c1);
TY_(UngetChar)('&', doc->docIn); /* otherwise put it back */
} }
} }
else else
{ {
/* otherwise put it back */ /* put this non-entity lead char back */
TY_(UngetChar)(c1, doc->docIn); TY_(UngetChar)(c1, doc->docIn);
/* Have leading surrogate pair, with no tail */
/* SP WARNING: BAD_SURROGATE_TAIL - use substitute character */
fprintf(stderr, "Warning: Leading(High) surrogate pair U+%04X, with no trailing(Low) entity, replaced with U+FFFD.\n", ch);
ch = 0xFFFD;
} }
}
else if (!preserveEntities && found && TY_(IsHighSurrogate)(ch))
{
/* Have trailing surrogate pair, with no lead */
/* SP WARNING: - BAD_SURROGATE_LEAD - - use substitute character */
fprintf(stderr, "Warning: Trailing (Low) surrogate pair U+%04X, with no leading (High) entity, replaced with U+FFFD.\n", ch);
ch = 0xFFFD;
} }
/* deal with unrecognized or invalid entities */ /* deal with unrecognized or invalid entities */

View file

@ -186,6 +186,10 @@ typedef enum {
INVALID_URI, INVALID_URI,
INVALID_NCR, INVALID_NCR,
BAD_SURROGATE_PAIR,
BAD_SURROGATE_TAIL,
BAD_SURROGATE_LEAD,
/* This MUST be present and last. */ /* This MUST be present and last. */
CODES_TIDY_ERROR_LAST CODES_TIDY_ERROR_LAST
} tidyErrorCodes; } tidyErrorCodes;