More updates for Issue #483 - Start warning msgs - WIP

This commit is contained in:
Geoff McLane 2017-02-09 20:55:23 +01:00
parent 3ca117550a
commit 75bc1f06c7
4 changed files with 58 additions and 15 deletions

View file

@ -328,6 +328,9 @@ static const tidyErrorFilterKeyItem tidyErrorFilterKeysStruct[] = {
{ "ENCODING_MISMATCH", ENCODING_MISMATCH },
{ "INVALID_URI", INVALID_URI },
{ "INVALID_NCR", INVALID_NCR },
{ "BAD_SURROGATE_PAIR", BAD_SURROGATE_PAIR },
{ "BAD_SURROGATE_TAIL", BAD_SURROGATE_TAIL },
{ "BAD_SURROGATE_LEAD", BAD_SURROGATE_LEAD },
{ "CODES_TIDY_ERROR_LAST", CODES_TIDY_ERROR_LAST },
#if SUPPORT_ACCESSIBILITY_CHECKS
/* This block of codes comes from the `accessErrorCodes` enum. */

View file

@ -336,7 +336,10 @@ static languageDefinition language_en = { whichPluralForm_en, {
{ INVALID_UTF8, 0, "%s invalid UTF-8 bytes (char. code %s)" }, /* Error */
{ INVALID_UTF16, 0, "%s invalid UTF-16 surrogate pair (char. code %s)" }, /* Error */
{ INVALID_NCR, 0, "%s invalid numeric character reference %s" }, /* Error */
{ BAD_SURROGATE_PAIR, 0, "Have out-of-range surrogate pair U+%s:U+%s, replaced with 2 U+FFFD values."}, /* warning */
{ BAD_SURROGATE_TAIL, 0, "Leading (High) surrogate pair U+%s, with no trailing (Low) entity, replaced with U+FFFD." }, /* warning */
{ BAD_SURROGATE_LEAD, 0, "Trailing (Low) surrogate pair U+%s, with no leading (High) entity, replaced with U+FFFD." }, /* warning */
/* ReportEntityError */
{ MISSING_SEMICOLON, 0, "entity \"%s\" doesn't end in ';'" }, /* Warning in HTML, Error in XML/XHTML */
{ MISSING_SEMICOLON_NCR, 0, "numeric character reference \"%s\" doesn't end in ';'" }, /* Warning in HTML, Error in XML/XHTML */

View file

@ -1034,25 +1034,33 @@ static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer )
}
/*
Issue #483
Have detected the first of a surrogate pair...
Try to find, decode the second...
Already have '&' start...
*/
static Bool GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
/*
 Outcome of GetSurrogatePair() when it tries to read the entity that
 should hold the second half of a surrogate pair.

 SP_ok     - the two values combined into a valid character, which is
             returned to the caller through *pch.
 SP_failed - a pair was read, but the combined value is out of range;
             the substitute character U+FFFD is returned instead.
 SP_error  - the initial (assumed) state: no usable second entity was
             decoded, and the characters consumed are put back.
*/
typedef enum
{
    SP_ok = 0,
    SP_failed = 1,
    SP_error = 2
} SPStatus;
static SPStatus GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
{
Lexer* lexer = doc->lexer;
uint bufSize = 32;
uint c, ch, offset = 0;
tmbstr buf = 0;
Bool success = no; /* assume failed */
SPStatus status = SP_error; /* assume failed */
int type = 0; /* assume numeric */
uint fch = *pch;
int i; /* has to be signed due to for i >= 0 */
if (!lexer)
return no;
return status;
buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize);
if (!buf)
return no;
return status;
while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream )
{
if (c == ';')
@ -1107,12 +1115,21 @@ static Bool GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
if (TY_(IsValidCombinedChar)(ch))
{
*pch = ch; /* return combined pair value */
success = yes;
status = SP_ok; /* full success - pair used */
}
else
{
status = SP_failed; /* is one of the 32 out-of-range pairs */
*pch = 0xFFFD; /* return substitute character */
/* SP WARNING: BAD_SURROGATE_PAIR */
fprintf(stderr, "Warning: Have out-of-range surrogate pair U+%04X:U+%04X, replaced with U+FFFD value.\n", fch, ch);
}
}
}
if (!success)
if (status == SP_error)
{
/* Error condition - can only put back all the chars */
if (c == ';') /* if last, not added to buffer */
TY_(UngetChar)(c, doc->docIn);
if (buf && offset)
@ -1129,7 +1146,7 @@ static Bool GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
if (buf)
TidyFree(lexer->allocator, buf);
return success;
return status;
}
/*
@ -1265,20 +1282,36 @@ static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode )
if (!preserveEntities && found && TY_(IsLowSurrogate)(ch))
{
uint c1;
if ((c1 = TY_(ReadChar)(doc->docIn)) == '&')
if ((c1 = TY_(ReadChar)(doc->docIn)) == '&')
{
/* have a following entity */
if (!GetSurrogatePair(doc, isXml, &ch))
SPStatus status;
/* Have a following entity,
so there is a chance of having a valid surrogate pair */
c1 = ch; /* keep first value, in case of error */
status = GetSurrogatePair(doc, isXml, &ch);
if (status == SP_error)
{
TY_(UngetChar)(c1, doc->docIn); /* otherwise put it back */
/* SP WARNING: BAD_SURROGATE_TAIL - use substitute character */
fprintf(stderr, "Warning: Leading(High) surrogate pair U+%04X, with no trailing(Low) entity, replaced with U+FFFD.\n", c1);
TY_(UngetChar)('&', doc->docIn); /* otherwise put it back */
}
}
else
else
{
/* otherwise put it back */
/* put this non-entity lead char back */
TY_(UngetChar)(c1, doc->docIn);
/* Have a leading surrogate, with no tail */
/* SP WARNING: BAD_SURROGATE_TAIL - use substitute character */
fprintf(stderr, "Warning: Leading(High) surrogate pair U+%04X, with no trailing(Low) entity, replaced with U+FFFD.\n", ch);
ch = 0xFFFD;
}
}
else if (!preserveEntities && found && TY_(IsHighSurrogate)(ch))
{
/* Have a trailing surrogate, with no lead */
/* SP WARNING: BAD_SURROGATE_LEAD - use substitute character */
fprintf(stderr, "Warning: Trailing (Low) surrogate pair U+%04X, with no leading (High) entity, replaced with U+FFFD.\n", ch);
ch = 0xFFFD;
}
/* deal with unrecognized or invalid entities */

View file

@ -186,6 +186,10 @@ typedef enum {
INVALID_URI,
INVALID_NCR,
BAD_SURROGATE_PAIR,
BAD_SURROGATE_TAIL,
BAD_SURROGATE_LEAD,
/* This MUST be present and last. */
CODES_TIDY_ERROR_LAST
} tidyErrorCodes;