More updates for Issue #483 - Start warning msgs - WIP
This commit is contained in:
parent
3ca117550a
commit
75bc1f06c7
|
@ -328,6 +328,9 @@ static const tidyErrorFilterKeyItem tidyErrorFilterKeysStruct[] = {
|
||||||
{ "ENCODING_MISMATCH", ENCODING_MISMATCH },
|
{ "ENCODING_MISMATCH", ENCODING_MISMATCH },
|
||||||
{ "INVALID_URI", INVALID_URI },
|
{ "INVALID_URI", INVALID_URI },
|
||||||
{ "INVALID_NCR", INVALID_NCR },
|
{ "INVALID_NCR", INVALID_NCR },
|
||||||
|
{ "BAD_SURROGATE_PAIR", BAD_SURROGATE_PAIR },
|
||||||
|
{ "BAD_SURROGATE_TAIL", BAD_SURROGATE_TAIL },
|
||||||
|
{ "BAD_SURROGATE_LEAD", BAD_SURROGATE_LEAD },
|
||||||
{ "CODES_TIDY_ERROR_LAST", CODES_TIDY_ERROR_LAST },
|
{ "CODES_TIDY_ERROR_LAST", CODES_TIDY_ERROR_LAST },
|
||||||
#if SUPPORT_ACCESSIBILITY_CHECKS
|
#if SUPPORT_ACCESSIBILITY_CHECKS
|
||||||
/* This block of codes comes from the `accessErrorCodes` enum. */
|
/* This block of codes comes from the `accessErrorCodes` enum. */
|
||||||
|
|
|
@ -336,6 +336,9 @@ static languageDefinition language_en = { whichPluralForm_en, {
|
||||||
{ INVALID_UTF8, 0, "%s invalid UTF-8 bytes (char. code %s)" }, /* Error */
|
{ INVALID_UTF8, 0, "%s invalid UTF-8 bytes (char. code %s)" }, /* Error */
|
||||||
{ INVALID_UTF16, 0, "%s invalid UTF-16 surrogate pair (char. code %s)" }, /* Error */
|
{ INVALID_UTF16, 0, "%s invalid UTF-16 surrogate pair (char. code %s)" }, /* Error */
|
||||||
{ INVALID_NCR, 0, "%s invalid numeric character reference %s" }, /* Error */
|
{ INVALID_NCR, 0, "%s invalid numeric character reference %s" }, /* Error */
|
||||||
|
{ BAD_SURROGATE_PAIR, 0, "Have out-of-range surrogate pair U+%s:U+%s, replaced with 2 U+FFFD values."}, /* warning */
|
||||||
|
{ BAD_SURROGATE_TAIL, 0, "Leading (High) surrogate pair U+%s, with no trailing (Low) entity, replaced with U+FFFD." }, /* warning */
|
||||||
|
{ BAD_SURROGATE_LEAD, 0, "Trailing (Low) surrogate pair U+%s, with no leading (High) entity, replaced with U+FFFD." }, /* warning */
|
||||||
|
|
||||||
/* ReportEntityError */
|
/* ReportEntityError */
|
||||||
{ MISSING_SEMICOLON, 0, "entity \"%s\" doesn't end in ';'" }, /* Warning in HTML, Error in XML/XHTML */
|
{ MISSING_SEMICOLON, 0, "entity \"%s\" doesn't end in ';'" }, /* Warning in HTML, Error in XML/XHTML */
|
||||||
|
|
61
src/lexer.c
61
src/lexer.c
|
@ -1034,25 +1034,33 @@ static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer )
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
Issue #483
|
||||||
Have detected the first of a surrogate pair...
|
Have detected the first of a surrogate pair...
|
||||||
Try to find, decode the second...
|
Try to find, decode the second...
|
||||||
Already have '&' start...
|
Already have '&' start...
|
||||||
*/
|
*/
|
||||||
static Bool GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
|
|
||||||
|
typedef enum {
|
||||||
|
SP_ok,
|
||||||
|
SP_failed,
|
||||||
|
SP_error
|
||||||
|
}SPStatus;
|
||||||
|
|
||||||
|
static SPStatus GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
|
||||||
{
|
{
|
||||||
Lexer* lexer = doc->lexer;
|
Lexer* lexer = doc->lexer;
|
||||||
uint bufSize = 32;
|
uint bufSize = 32;
|
||||||
uint c, ch, offset = 0;
|
uint c, ch, offset = 0;
|
||||||
tmbstr buf = 0;
|
tmbstr buf = 0;
|
||||||
Bool success = no; /* assume failed */
|
SPStatus status = SP_error; /* assume failed */
|
||||||
int type = 0; /* assume numeric */
|
int type = 0; /* assume numeric */
|
||||||
uint fch = *pch;
|
uint fch = *pch;
|
||||||
int i; /* has to be signed due to for i >= 0 */
|
int i; /* has to be signed due to for i >= 0 */
|
||||||
if (!lexer)
|
if (!lexer)
|
||||||
return no;
|
return status;
|
||||||
buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize);
|
buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize);
|
||||||
if (!buf)
|
if (!buf)
|
||||||
return no;
|
return status;
|
||||||
while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream )
|
while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream )
|
||||||
{
|
{
|
||||||
if (c == ';')
|
if (c == ';')
|
||||||
|
@ -1107,12 +1115,21 @@ static Bool GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
|
||||||
if (TY_(IsValidCombinedChar)(ch))
|
if (TY_(IsValidCombinedChar)(ch))
|
||||||
{
|
{
|
||||||
*pch = ch; /* return combined pair value */
|
*pch = ch; /* return combined pair value */
|
||||||
success = yes;
|
status = SP_ok; /* full success - pair used */
|
||||||
}
|
}
|
||||||
}
|
else
|
||||||
}
|
|
||||||
if (!success)
|
|
||||||
{
|
{
|
||||||
|
status = SP_failed; /* is one of the 32 out-of-range pairs */
|
||||||
|
*pch = 0xFFFD; /* return substitute character */
|
||||||
|
/* SP WARNING: BAD_SURROGATE_PAIR */
|
||||||
|
fprintf(stderr, "Warning: Have out-of-range surrogate pair U+%04X:U+%04X, replaced with U+FFFD value.\n", fch, ch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (status == SP_error)
|
||||||
|
{
|
||||||
|
/* Error condition - can only put back all the chars */
|
||||||
if (c == ';') /* if last, not added to buffer */
|
if (c == ';') /* if last, not added to buffer */
|
||||||
TY_(UngetChar)(c, doc->docIn);
|
TY_(UngetChar)(c, doc->docIn);
|
||||||
if (buf && offset)
|
if (buf && offset)
|
||||||
|
@ -1129,7 +1146,7 @@ static Bool GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
|
||||||
if (buf)
|
if (buf)
|
||||||
TidyFree(lexer->allocator, buf);
|
TidyFree(lexer->allocator, buf);
|
||||||
|
|
||||||
return success;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1267,18 +1284,34 @@ static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode )
|
||||||
uint c1;
|
uint c1;
|
||||||
if ((c1 = TY_(ReadChar)(doc->docIn)) == '&')
|
if ((c1 = TY_(ReadChar)(doc->docIn)) == '&')
|
||||||
{
|
{
|
||||||
/* have a following entity */
|
SPStatus status;
|
||||||
if (!GetSurrogatePair(doc, isXml, &ch))
|
/* Have a following entity,
|
||||||
|
so there is a chance of having a valid surrogate pair */
|
||||||
|
c1 = ch; /* keep first value, in case of error */
|
||||||
|
status = GetSurrogatePair(doc, isXml, &ch);
|
||||||
|
if (status == SP_error)
|
||||||
{
|
{
|
||||||
TY_(UngetChar)(c1, doc->docIn); /* otherwise put it back */
|
/* SP WARNING: BAD_SURROGATE_TAIL - use substitute character */
|
||||||
|
fprintf(stderr, "Warning: Leading(High) surrogate pair U+%04X, with no trailing(Low) entity, replaced with U+FFFD.\n", c1);
|
||||||
|
TY_(UngetChar)('&', doc->docIn); /* otherwise put it back */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
/* otherwise put it back */
|
/* put this non-entity lead char back */
|
||||||
TY_(UngetChar)(c1, doc->docIn);
|
TY_(UngetChar)(c1, doc->docIn);
|
||||||
|
/* Have a leading surrogate with no trailing surrogate following it */
|
||||||
|
/* SP WARNING: BAD_SURROGATE_TAIL - use substitute character */
|
||||||
|
fprintf(stderr, "Warning: Leading(High) surrogate pair U+%04X, with no trailing(Low) entity, replaced with U+FFFD.\n", ch);
|
||||||
|
ch = 0xFFFD;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
else if (!preserveEntities && found && TY_(IsHighSurrogate)(ch))
|
||||||
|
{
|
||||||
|
/* Have a trailing surrogate with no leading surrogate before it */
|
||||||
|
/* SP WARNING: BAD_SURROGATE_LEAD - use substitute character */
|
||||||
|
fprintf(stderr, "Warning: Trailing (Low) surrogate pair U+%04X, with no leading (High) entity, replaced with U+FFFD.\n", ch);
|
||||||
|
ch = 0xFFFD;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* deal with unrecognized or invalid entities */
|
/* deal with unrecognized or invalid entities */
|
||||||
|
|
|
@ -186,6 +186,10 @@ typedef enum {
|
||||||
INVALID_URI,
|
INVALID_URI,
|
||||||
INVALID_NCR,
|
INVALID_NCR,
|
||||||
|
|
||||||
|
BAD_SURROGATE_PAIR,
|
||||||
|
BAD_SURROGATE_TAIL,
|
||||||
|
BAD_SURROGATE_LEAD,
|
||||||
|
|
||||||
/* This MUST be present and last. */
|
/* This MUST be present and last. */
|
||||||
CODES_TIDY_ERROR_LAST
|
CODES_TIDY_ERROR_LAST
|
||||||
} tidyErrorCodes;
|
} tidyErrorCodes;
|
||||||
|
|
Loading…
Reference in a new issue